# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for BenchmarkRegistry module."""

import json

from superbench.benchmarks import Platform, Framework, Precision, BenchmarkRegistry, BenchmarkType, ReturnCode
from superbench.benchmarks.model_benchmarks import ModelBenchmark


class FakeModelBenchmark(ModelBenchmark):
    """Fake benchmark inherit from ModelBenchmark."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name: benchmark name.
            parameters: benchmark parameters.
        """
        super().__init__(name, parameters)
        self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16]
        self._sub_benchmark_start_time = 0

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()
        self._parser.add_argument(
            '--hidden_size',
            type=int,
            default=1024,
            required=False,
            help='Hidden size.',
        )

        self._parser.add_argument(
            '--seq_len',
            type=int,
            default=512,
            required=False,
            help='Sequence length.',
        )

    def _judge_gpu_availability(self):
        """Judge GPUs' availability according to arguments and running environment."""
        self._gpu_available = False

    def _set_force_fp32(self):
        """Set the config that controls whether full float32 precision will be used."""
        pass

    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU."""
        return True

    def _generate_dataset(self):
        """Generate dataset for benchmarking according to shape info."""
        return True

    def _init_dataloader(self):
        """Initialize the distributed dataloader."""
        return True

    def _create_optimizer(self):
        """Create the optimzier instance used for training."""
        return True

    def _create_model(self, precision):
        """Construct the model for benchmarking."""
        return True

    def _train_step(self, precision):
        """Define the training process.

        Args:
            precision (str): precision of model and input data,
              such as float, half.

        Return:
            The step-time list of every training step.
        """
        duration = []
        for i in range(self._args.num_steps):
            duration.append(2.0)
        return duration

    def _inference_step(self, precision):
        """Define the inference process.

        Args:
            precision (str): precision of model and input data,
              such as float, half.

        Return:
            The latency list of every inference operation.
        """
        duration = []
        for i in range(self._args.num_steps):
            duration.append(4.0)
        return duration

    def _cal_params_count(self):
        """Calculate the parameters scale of the model.

        Return:
            The count of trainable parameters.
        """
        return 200


def create_benchmark(params='--num_steps 8'):
    """Register and create benchmark."""
    # Register the FakeModelBenchmark benchmark.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model',
        FakeModelBenchmark,
        parameters='--hidden_size 2',
        platform=Platform.CUDA,
    )
    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model', platform=Platform.CUDA, parameters=params, framework=Framework.PYTORCH
    )
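    # The registry keeps its lookup helpers private; reach them through
    # Python's name mangling (__get_benchmark_name becomes
    # _BenchmarkRegistry__get_benchmark_name) so the test can build the
    # benchmark instance directly instead of running it end to end.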
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert (name)
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, context.platform)
    assert (benchmark_class)
    return benchmark_class(name, predefine_params + ' ' + context.parameters)


def test_arguments_related_interfaces():
    """Test arguments related interfaces.

    Benchmark.add_parser_arguments(),
    Benchmark.parse_args(),
    Benchmark.get_configurable_settings()
    """
    # Positive case for parse_args().
    benchmark = create_benchmark('--num_steps 9')
    benchmark.add_parser_arguments()
    (ret, args, unknown) = benchmark.parse_args()
    assert (ret and args.num_steps == 9)

    # Negative case for parse_args() - invalid precision.
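    # 'fp32' is not a member of the Precision enum (the parser expects
    # 'float32'), so parsing should fail.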
    benchmark = create_benchmark('--num_steps 8 --precision fp32')
    benchmark.add_parser_arguments()
    (ret, args, unknown) = benchmark.parse_args()
    assert (ret is False)

    # Test get_configurable_settings().
    settings = benchmark.get_configurable_settings()
    expected_settings = (
        """optional arguments:
  --batch_size int      The number of batch size.
  --distributed_backend DistributedBackend
                        Distributed backends. E.g. nccl mpi gloo.
  --distributed_impl DistributedImpl
                        Distributed implementations. E.g. ddp mirrored
                        multiworkermirrored parameterserver horovod.
  --duration int        The elapsed time of benchmark in seconds.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
                        Benchmark model process. E.g. train inference.
  --no_gpu              Disable GPU training.
  --num_steps int       The number of test step.
  --num_warmup int      The number of warmup step.
  --num_workers int     Number of subprocesses to use for data loading.
  --pin_memory          Enable option to pin memory in data loader.
  --precision Precision [Precision ...]
                        Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
                        float16 float32 float64 bfloat16 uint8 int8 int16
                        int32 int64.
  --run_count int       The run count of benchmark.
  --sample_count int    The number of data samples in dataset.
  --seq_len int         Sequence length."""
    )
    assert (settings == expected_settings)


def test_preprocess():
    """Test interface Benchmark._preprocess()."""
    # Positive case for _preprocess().
    benchmark = create_benchmark('--num_steps 8')
    assert (benchmark._preprocess())
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    settings = benchmark.get_configurable_settings()
    expected_settings = (
        """optional arguments:
  --batch_size int      The number of batch size.
  --distributed_backend DistributedBackend
                        Distributed backends. E.g. nccl mpi gloo.
  --distributed_impl DistributedImpl
                        Distributed implementations. E.g. ddp mirrored
                        multiworkermirrored parameterserver horovod.
  --duration int        The elapsed time of benchmark in seconds.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
                        Benchmark model process. E.g. train inference.
  --no_gpu              Disable GPU training.
  --num_steps int       The number of test step.
  --num_warmup int      The number of warmup step.
  --num_workers int     Number of subprocesses to use for data loading.
  --pin_memory          Enable option to pin memory in data loader.
  --precision Precision [Precision ...]
                        Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
                        float16 float32 float64 bfloat16 uint8 int8 int16
                        int32 int64.
  --run_count int       The run count of benchmark.
  --sample_count int    The number of data samples in dataset.
  --seq_len int         Sequence length."""
    )
    assert (settings == expected_settings)

    # Negative case for _preprocess() - invalid precision.
    benchmark = create_benchmark('--num_steps 8 --precision fp32')
    assert (benchmark._preprocess() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

    # Negative case for _preprocess() - invalid benchmark type.
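    # Deliberately assign a Platform enum where a BenchmarkType is expected,
    # which _preprocess() should reject.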
    benchmark = create_benchmark('--num_steps 8 --precision float32')
    benchmark._benchmark_type = Platform.CUDA
    assert (benchmark._preprocess() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_TYPE)


def test_train():
    """Test interface Benchmark.__train()."""
    benchmark = create_benchmark()
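    # create_benchmark() defaults to '--num_steps 8', hence the eight entries
    # in each raw_data list below.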
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
        '"start_time": null, "end_time": null, "raw_data": {'
        '"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
        '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0]}, '
        '"reduce_op": {"return_code": null, "fp32_train_step_time": null, "fp32_train_throughput": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__train(Precision.FLOAT32))
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))

    # Step time list is empty (simulate training failure).
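    # With no steps, the step-time list stays empty and the run fails with
    # return code 3, i.e. ReturnCode.INVALID_BENCHMARK_RESULT (asserted by
    # name in test_benchmark() below).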
    benchmark = create_benchmark('--num_steps 0')
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
        '"start_time": null, "end_time": null, "raw_data": {}, '
        '"result": {"return_code": [3]}, "reduce_op": {"return_code": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__train(Precision.FLOAT32) is False)
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))


def test_inference():
    """Test interface Benchmark.__inference()."""
    benchmark = create_benchmark()
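    # Unlike training, inference results also carry percentile metrics
    # (50th/90th/95th/99th/99.9th); with a constant 4 ms step time every
    # percentile collapses to the same value.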
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
        '"start_time": null, "end_time": null, "raw_data": {'
        '"fp16_inference_step_time": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
        '"fp16_inference_throughput": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
        '"result": {"return_code": [0], "fp16_inference_step_time": [4.0], '
        '"fp16_inference_step_time_50": [4.0], "fp16_inference_step_time_90": [4.0], '
        '"fp16_inference_step_time_95": [4.0], "fp16_inference_step_time_99": [4.0], '
        '"fp16_inference_step_time_99.9": [4.0], '
        '"fp16_inference_throughput": [8000.0], '
        '"fp16_inference_throughput_50": [8000.0], "fp16_inference_throughput_90": [8000.0], '
        '"fp16_inference_throughput_95": [8000.0], "fp16_inference_throughput_99": [8000.0], '
        '"fp16_inference_throughput_99.9": [8000.0]}, '
        '"reduce_op": {"return_code": null, "fp16_inference_step_time": null, '
        '"fp16_inference_step_time_50": null, "fp16_inference_step_time_90": null, '
        '"fp16_inference_step_time_95": null, "fp16_inference_step_time_99": null, '
        '"fp16_inference_step_time_99.9": null, "fp16_inference_throughput": null, '
        '"fp16_inference_throughput_50": null, "fp16_inference_throughput_90": null, '
        '"fp16_inference_throughput_95": null, "fp16_inference_throughput_99": null, '
        '"fp16_inference_throughput_99.9": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))

    # Step time list is empty (simulate inference failure).
    benchmark = create_benchmark('--num_steps 0')
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
        '"start_time": null, "end_time": null, "raw_data": {}, '
        '"result": {"return_code": [3]}, "reduce_op": {"return_code": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16) is False)
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))


def test_benchmark():
    """Test interface Benchmark._benchmark()."""
    # Positive case for _benchmark().
    benchmark = create_benchmark()
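    # Without an explicit --precision argument, both supported precisions
    # (float32 and float16) are benchmarked, so fp32_* and fp16_* metrics
    # both appear in the results.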
    benchmark._preprocess()
    assert (benchmark._benchmark())
    assert (benchmark.name == 'pytorch-fake-model')
    assert (benchmark.type == BenchmarkType.MODEL)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    expected_raw_data = {
        'fp32_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
        'fp32_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
        'fp16_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
        'fp16_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
    }
    assert (benchmark.raw_data == expected_raw_data)
    expected_result = {
        'return_code': [0],
        'fp32_train_step_time': [2.0],
        'fp32_train_throughput': [16000.0],
        'fp16_train_step_time': [2.0],
        'fp16_train_throughput': [16000.0]
    }
    assert (benchmark.result == expected_result)

    expected_serialized_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, "start_time": null, '
        '"end_time": null, "raw_data": {"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
        '"fp16_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp16_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
        '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0], '
        '"fp16_train_step_time": [2.0], "fp16_train_throughput": [16000.0]}, '
        '"reduce_op": {"return_code": null, "fp32_train_step_time": null, "fp32_train_throughput": null, '
        '"fp16_train_step_time": null, "fp16_train_throughput": null}}'
    )
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_serialized_result))

    # Negative case for _benchmark() - no supported precision found.
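    # int16 is a valid Precision member but is not in the fake benchmark's
    # _supported_precision list, so there is nothing to run.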
    benchmark = create_benchmark('--precision int16')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)

    # Negative case for _benchmark() - model train failure, step time list is empty.
    benchmark = create_benchmark('--num_steps 0')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)

    # Negative case for _benchmark() - model inference failure, step time list is empty.
    benchmark = create_benchmark('--model_action inference --num_steps 0')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)


def test_check_result_format():
    """Test interface Benchmark.__check_result_format()."""
    # Positive case for __check_result_format().
    benchmark = create_benchmark()
    benchmark._preprocess()
    assert (benchmark._benchmark())
    assert (benchmark._Benchmark__check_result_type())
    assert (benchmark._Benchmark__check_summarized_result())
    assert (benchmark._Benchmark__check_raw_data())

    # Negative case for __check_summarized_result() - change List[float] to List[str].
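    # Reach through name mangling (_BenchmarkResult__result) to inject a
    # malformed summary for the checker to reject.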
    benchmark._result._BenchmarkResult__result = {'return_code': [0], 'metric1': ['2.0']}
    assert (benchmark._Benchmark__check_summarized_result() is False)

    # Negative case for __check_raw_data() - change List[List[float]] to List[List[str]].
    benchmark._result._BenchmarkResult__raw_data = {'metric1': [['2.0']]}
    assert (benchmark._Benchmark__check_raw_data() is False)

    # Negative case for __check_result_format() - invalid benchmark result.
    assert (benchmark._Benchmark__check_result_format() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)


def test_is_finished():
    """Test interface Benchmark._is_finished()."""
    # Only step takes effect, benchmarking finishes due to step.
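    # 160 equals num_warmup + num_steps (32 + 128), so the step-based stop
    # criterion appears to count warmup steps toward the total.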
    benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 0')
    benchmark._preprocess()
    end_time = 2
    curr_step = 50
    assert (benchmark._is_finished(curr_step, end_time) is False)
    curr_step = 160
    assert (benchmark._is_finished(curr_step, end_time))

    # Only duration takes effect, benchmarking finishes due to duration.
    benchmark = create_benchmark('--num_warmup 32 --num_steps 0 --duration 10')
    benchmark._preprocess()
    benchmark._sub_benchmark_start_time = 0
    curr_step = 50
    end_time = 1
    assert (benchmark._is_finished(curr_step, end_time) is False)
    end_time = 10
    assert (benchmark._is_finished(curr_step, end_time))

    # Both step and duration take effect.
    benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 10')
    benchmark._preprocess()
    # Benchmarking finishes due to step.
    curr_step = 160
    end_time = 2
    assert (benchmark._is_finished(curr_step, end_time))
    # Benchmarking finishes due to duration.
    curr_step = 50
    end_time = 10
    assert (benchmark._is_finished(curr_step, end_time))