Unverified commit 44f0270e authored by guoshzhao, committed by GitHub

Benchmarks: Add Feature - Add return_code metric into result (#256)

**Description**
Add the return_code metric into the benchmark result and revise the unit tests accordingly.
parent 655f238d
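With this change, every BenchmarkResult carries a `return_code` entry in its `result` dict from construction onward, and the tests count metrics as `N + benchmark.default_metric_count` instead of hard-coding `N`. Below is a minimal, self-contained sketch of that pattern; the `FakeResult` class and the usage at the bottom are illustrative stand-ins, not the superbench implementation:

```python
# Minimal sketch of the pattern this commit introduces; FakeResult is a
# stand-in, not superbench.benchmarks.result.BenchmarkResult.
from enum import Enum


class ReturnCode(Enum):
    SUCCESS = 0
    INVALID_ARGUMENT = 1


class FakeResult:
    def __init__(self, return_code):
        # 'return_code' is stored as a regular metric from the start.
        self._result = {'return_code': [return_code.value]}

    def set_return_code(self, return_code):
        # Keep the stored metric in sync with the latest return code.
        self._result['return_code'][0] = return_code.value

    def add_result(self, metric, value):
        self._result.setdefault(metric, []).append(value)

    @property
    def result(self):
        return self._result

    @property
    def default_metric_count(self):
        # 'return_code' is currently the only default metric.
        return 1 if 'return_code' in self._result else 0


result = FakeResult(ReturnCode.SUCCESS)
result.add_result('metric1', 300)
# Tests now compare against N user metrics plus the default metrics.
assert len(result.result) == 1 + result.default_metric_count
result.set_return_code(ReturnCode.INVALID_ARGUMENT)
assert result.result['return_code'] == [ReturnCode.INVALID_ARGUMENT.value]
```

The diff below shows the actual changes in superbench/benchmarks and the revised unit tests.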
@@ -283,3 +283,8 @@ def result(self):
def serialized_result(self):
"""Decoration function to access benchmark result."""
return self._result.to_string()
@property
def default_metric_count(self):
"""Decoration function to get the count of default metrics."""
return self._result.default_metric_count
@@ -31,6 +31,7 @@ def __init__(self, name, type, return_code, run_count=0):
self.__end_time = None
self.__raw_data = dict()
self.__result = dict()
self.__result['return_code'] = [return_code.value]
self.__reduce_op = dict()
def __eq__(self, rhs):
@@ -119,6 +120,7 @@ def set_return_code(self, return_code):
return_code (ReturnCode): return code defined in superbench.benchmarks.ReturnCode.
"""
self.__return_code = return_code
self.__result['return_code'][0] = return_code.value
def to_string(self):
"""Serialize the BenchmarkResult object to string.
@@ -158,6 +160,15 @@ def return_code(self):
"""Decoration function to access __return_code."""
return self.__return_code
@property
def default_metric_count(self):
"""Decoration function to get the count of default metrics."""
count = 0
if 'return_code' in self.__result:
count += 1
return count
@property
def start_time(self):
"""Decoration function to access __start_time."""
@@ -45,8 +45,8 @@ def test_pytorch_computation_communication_overlap_normal():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (len(benchmark.raw_data) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.result) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.raw_data) == len(benchmark._args.kernel))
assert (len(benchmark.result) == len(benchmark._args.kernel) + benchmark.default_metric_count)
@decorator.cuda_test
@@ -79,6 +79,6 @@ def test_pytorch_computation_communication_overlap_fake_distributed():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (len(benchmark.raw_data) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.result) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.raw_data) == len(benchmark._args.kernel))
assert (len(benchmark.result) == len(benchmark._args.kernel) + benchmark.default_metric_count)
utils.clean_simulated_ddp_distributed_env()
@@ -42,7 +42,8 @@ def test_cublas_functions():
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
# Test for custom configuration
custom_config_str = '{"name":"cublasCgemm","m":512,"n":512,"k":32,"transa":1,"transb":0}'
@@ -73,8 +74,9 @@ def test_cublas_functions():
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
assert (1 == len(benchmark.result))
assert (1 + benchmark.default_metric_count == len(benchmark.result))
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
@@ -42,7 +42,8 @@ def test_cudnn_functions():
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
# Test for custom configuration
custom_config_str = '{"algo":0,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],' \
@@ -77,8 +78,9 @@ def test_cudnn_functions():
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
assert (1 == len(benchmark.result))
assert (1 + benchmark.default_metric_count == len(benchmark.result))
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
@@ -509,7 +509,7 @@ def test_disk_performance_result_parsing(self):
assert (benchmark.return_code == ReturnCode.SUCCESS)
# bs + <read, write> x <iops, 95th, 99th, 99.9th>
assert (9 == len(benchmark.result.keys()))
assert (9 + benchmark.default_metric_count == len(benchmark.result.keys()))
assert (1 == len(benchmark.result[jobname_prefix + '_bs']))
assert (4096 == benchmark.result[jobname_prefix + '_bs'][0])
@@ -93,7 +93,7 @@ def test_gpcnet_network_test(self):
--------------------------------------------------------------------------
"""
assert (benchmark._process_raw_result(0, raw_output_no_execution))
assert (len(benchmark.result) == 0)
assert (len(benchmark.result) == benchmark.default_metric_count)
# Check function process_raw_data.
# Positive case - valid raw output.
@@ -250,7 +250,7 @@ def test_gpcnet_network_load(self): # noqa: C901
--------------------------------------------------------------------------
"""
assert (benchmark._process_raw_result(0, raw_output_no_execution))
assert (len(benchmark.result) == 0)
assert (len(benchmark.result) == benchmark.default_metric_count)
# Positive case - valid raw output.
assert (benchmark._process_raw_result(0, raw_output))
test_name = 'IsolatedNetworkTests'
@@ -112,12 +112,15 @@ def _test_gpu_copy_bw_performance_result_parsing(self, platform):
assert (1 == len(benchmark.raw_data))
print(test_raw_output.splitlines())
test_raw_output_dict = {x.split()[0]: float(x.split()[1]) for x in test_raw_output.strip().splitlines()}
assert (len(test_raw_output_dict) == len(benchmark.result))
assert (len(test_raw_output_dict) + benchmark.default_metric_count == len(benchmark.result))
for output_key in benchmark.result:
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
assert (output_key in test_raw_output_dict)
assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])
if output_key == 'return_code':
assert (benchmark.result[output_key] == [0])
else:
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
assert (output_key in test_raw_output_dict)
assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -133,7 +133,7 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
self.assertEqual(6, len(benchmark.result))
self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
for tag in ['mean', '99']:
self.assertEqual(0.5, benchmark.result[f'gpu_lat_ms_{tag}'][0])
self.assertEqual(0.6, benchmark.result[f'host_lat_ms_{tag}'][0])
@@ -225,7 +225,7 @@ def test_train():
'"start_time": null, "end_time": null, "raw_data": {'
'"steptime_train_float32": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
'"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
'"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min"}}'
)
assert (benchmark._preprocess())
@@ -236,7 +236,7 @@ def test_train():
benchmark = create_benchmark('--num_steps 0')
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
'"start_time": null, "end_time": null, "raw_data": {}, "result": {}, "reduce_op": {}}'
'"start_time": null, "end_time": null, "raw_data": {}, "result": {"return_code": [3]}, "reduce_op": {}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__train(Precision.FLOAT32) is False)
@@ -251,8 +251,8 @@ def test_inference():
'"start_time": null, "end_time": null, "raw_data": {'
'"steptime_inference_float16": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
'"throughput_inference_float16": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
'"result": {"steptime_inference_float16": [4.0], "throughput_inference_float16": [8000.0]}, '
'"reduce_op": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
'"result": {"return_code": [0], "steptime_inference_float16": [4.0], "throughput_inference_float16": '
'[8000.0]}, "reduce_op": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
@@ -262,7 +262,7 @@ def test_inference():
benchmark = create_benchmark('--num_steps 0')
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
'"start_time": null, "end_time": null, "raw_data": {}, "result": {}, "reduce_op": {}}'
'"start_time": null, "end_time": null, "raw_data": {}, "result": {"return_code": [3]}, "reduce_op": {}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16) is False)
@@ -287,6 +287,7 @@ def test_benchmark():
}
assert (benchmark.raw_data == expected_raw_data)
expected_result = {
'return_code': [0],
'steptime_train_float32': [2.0],
'throughput_train_float32': [16000.0],
'steptime_train_float16': [2.0],
@@ -300,7 +301,7 @@ def test_benchmark():
'"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
'"steptime_train_float16": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float16": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
'"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
'"steptime_train_float16": [2.0], "throughput_train_float16": [16000.0]}, '
'"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min", '
'"steptime_train_float16": "max", "throughput_train_float16": "min"}}'
@@ -337,7 +338,7 @@ def test_check_result_format():
assert (benchmark._Benchmark__check_raw_data())
# Negative case for __check_result_format() - change List[int] to List[str].
benchmark._result._BenchmarkResult__result = {'metric1': ['2.0']}
benchmark._result._BenchmarkResult__result = {'return_code': [0], 'metric1': ['2.0']}
assert (benchmark._Benchmark__check_summarized_result() is False)
# Negative case for __check_raw_data() - change List[List[int]] to List[List[str]].
@@ -139,7 +139,7 @@ def test_launch_benchmark():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark.raw_data == {'accumulation_result': ['1,3,6,10']})
assert (benchmark.result == {'accumulation_result': [10]})
assert (benchmark.result == {'return_code': [0], 'accumulation_result': [10]})
# Replace the timestamp as null.
result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null', benchmark.serialized_result)
@@ -147,7 +147,7 @@ def test_launch_benchmark():
'{"name": "accumulation", "type": "micro", "run_count": 1, '
'"return_code": 0, "start_time": null, "end_time": null, '
'"raw_data": {"accumulation_result": ["1,3,6,10"]}, '
'"result": {"accumulation_result": [10]}, '
'"result": {"return_code": [0], "accumulation_result": [10]}, '
'"reduce_op": {"accumulation_result": null}}'
)
assert (result == expected)
@@ -163,7 +163,7 @@ def test_launch_benchmark():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark.raw_data == {'accumulation_result': ['1,3,6']})
assert (benchmark.result == {'accumulation_result': [6]})
assert (benchmark.result == {'return_code': [0], 'accumulation_result': [6]})
# Replace the timestamp as null.
result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null', benchmark.serialized_result)
@@ -171,7 +171,7 @@ def test_launch_benchmark():
'{"name": "accumulation", "type": "micro", "run_count": 1, '
'"return_code": 0, "start_time": null, "end_time": null, '
'"raw_data": {"accumulation_result": ["1,3,6"]}, '
'"result": {"accumulation_result": [6]}, '
'"result": {"return_code": [0], "accumulation_result": [6]}, '
'"reduce_op": {"accumulation_result": null}}'
)
assert (result == expected)
@@ -9,27 +9,27 @@
def test_add_raw_data():
"""Test interface BenchmarkResult.add_raw_data()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.add_raw_data('metric1', 'raw log 1')
result.add_raw_data('metric1', 'raw log 2')
assert (result.raw_data['metric1'][0] == 'raw log 1')
assert (result.raw_data['metric1'][1] == 'raw log 2')
assert (result.type == BenchmarkType.MICRO.value)
assert (result.return_code == ReturnCode.SUCCESS.value)
assert (result.type == BenchmarkType.MICRO)
assert (result.return_code == ReturnCode.SUCCESS)
result = BenchmarkResult('model', BenchmarkType.MODEL.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('model', BenchmarkType.MODEL, ReturnCode.SUCCESS)
result.add_raw_data('metric1', [1, 2, 3])
result.add_raw_data('metric1', [4, 5, 6])
assert (result.raw_data['metric1'][0] == [1, 2, 3])
assert (result.raw_data['metric1'][1] == [4, 5, 6])
assert (result.type == BenchmarkType.MODEL.value)
assert (result.return_code == ReturnCode.SUCCESS.value)
assert (result.type == BenchmarkType.MODEL)
assert (result.return_code == ReturnCode.SUCCESS)
def test_add_result():
"""Test interface BenchmarkResult.add_result()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.add_result('metric1', 300)
result.add_result('metric1', 200)
assert (result.result['metric1'][0] == 300)
@@ -38,7 +38,7 @@ def test_add_result():
def test_set_timestamp():
"""Test interface BenchmarkResult.set_timestamp()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
start_time = '2021-02-03 16:59:49'
end_time = '2021-02-03 17:00:08'
result.set_timestamp(start_time, end_time)
@@ -48,25 +48,28 @@ def test_set_timestamp():
def test_set_benchmark_type():
"""Test interface BenchmarkResult.set_benchmark_type()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result.set_benchmark_type(BenchmarkType.MICRO.value)
assert (result.type == BenchmarkType.MICRO.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.set_benchmark_type(BenchmarkType.MICRO)
assert (result.type == BenchmarkType.MICRO)
def test_set_return_code():
"""Test interface BenchmarkResult.set_return_code()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
assert (result.return_code == ReturnCode.SUCCESS.value)
result.set_return_code(ReturnCode.INVALID_ARGUMENT.value)
assert (result.return_code == ReturnCode.INVALID_ARGUMENT.value)
result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT.value)
assert (result.return_code == ReturnCode.INVALID_BENCHMARK_RESULT.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
assert (result.return_code == ReturnCode.SUCCESS)
assert (result.result['return_code'] == [ReturnCode.SUCCESS.value])
result.set_return_code(ReturnCode.INVALID_ARGUMENT)
assert (result.return_code == ReturnCode.INVALID_ARGUMENT)
assert (result.result['return_code'] == [ReturnCode.INVALID_ARGUMENT.value])
result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
assert (result.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)
assert (result.result['return_code'] == [ReturnCode.INVALID_BENCHMARK_RESULT.value])
def test_serialize_deserialize():
"""Test serialization/deserialization and compare the results."""
# Result with one metric.
result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value, run_count=2)
result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=2)
result.add_result('metric1', 300, ReduceType.MAX)
result.add_result('metric1', 200, ReduceType.MAX)
result.add_result('metric2', 100, ReduceType.AVG)
@@ -76,13 +79,13 @@ def test_serialize_deserialize():
start_time = '2021-02-03 16:59:49'
end_time = '2021-02-03 17:00:08'
result.set_timestamp(start_time, end_time)
result.set_benchmark_type(BenchmarkType.MICRO.value)
result.set_benchmark_type(BenchmarkType.MICRO)
expected = (
'{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, '
'"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", '
'"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, '
'"result": {"metric1": [300, 200], "metric2": [100]}, '
'"result": {"return_code": [0], "metric1": [300, 200], "metric2": [100]}, '
'"reduce_op": {"metric1": "max", "metric2": "avg"}}'
)
assert (result.to_string() == expected)
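As a quick sanity check on the serialized shape, the `return_code` entry now leads the `result` object and mirrors the top-level `return_code` field. A small hypothetical round-trip check (not part of the repo's tests) over the expected string above:

```python
# Hypothetical round-trip check (not from the repo's tests) over the expected
# serialized string shown in test_serialize_deserialize above.
import json

serialized = (
    '{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, '
    '"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", '
    '"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, '
    '"result": {"return_code": [0], "metric1": [300, 200], "metric2": [100]}, '
    '"reduce_op": {"metric1": "max", "metric2": "avg"}}'
)
parsed = json.loads(serialized)
# The result dict now always contains the default 'return_code' metric,
# and its value matches the top-level return_code field.
assert parsed['result']['return_code'] == [parsed['return_code']]
assert len(parsed['result']) == 2 + 1  # two user metrics plus one default metric
```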