test_model_base.py 17.7 KB
Newer Older
1
2
3
4
5
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for BenchmarkRegistry module."""

6
7
import json

8
from superbench.benchmarks import Platform, Framework, Precision, BenchmarkRegistry, BenchmarkType, ReturnCode
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from superbench.benchmarks.model_benchmarks import ModelBenchmark


class FakeModelBenchmark(ModelBenchmark):
    """Minimal concrete ModelBenchmark used as a test double.

    Every abstract hook is stubbed out so the base-class workflow
    (argument parsing, preprocess, train/inference loops, result
    aggregation) can be exercised without any real framework or GPU.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name: benchmark name.
            parameters: benchmark parameters.
        """
        super().__init__(name, parameters)
        # Only fp32/fp16 are advertised so precision-filtering tests are deterministic.
        self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16]
        self._sub_benchmark_start_time = 0

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()
        self._parser.add_argument('--hidden_size', type=int, default=1024, required=False, help='Hidden size.')
        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')

    def _judge_gpu_availability(self):
        """Judge GPUs' availability according to arguments and running environment."""
        # Force CPU mode so tests never depend on real hardware.
        self._gpu_available = False

    def _set_force_fp32(self):
        """Set the config that controls whether full float32 precision will be used."""
        pass

    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU."""
        return True

    def _generate_dataset(self):
        """Generate dataset for benchmarking according to shape info."""
        return True

    def _init_dataloader(self):
        """Initialize the distributed dataloader."""
        return True

    def _create_optimizer(self):
        """Create the optimzier instance used for training."""
        return True

    def _create_model(self, precision):
        """Construct the model for benchmarking."""
        return True

    def _train_step(self, precision):
        """Define the training process.

        Args:
            precision (str): precision of model and input data,
              such as float, half.

        Return:
            The step-time list of every training step.
        """
        # Constant 2.0 ms per step keeps expected throughput values exact.
        return [2.0] * self._args.num_steps

    def _inference_step(self, precision):
        """Define the inference process.

        Args:
            precision (str): precision of model and input data,
              such as float, half.

        Return:
            The latency list of every inference operation.
        """
        # Constant 4.0 ms per step keeps expected throughput values exact.
        return [4.0] * self._args.num_steps

    def _cal_params_count(self):
        """Calculate the parameters scale of the model.

        Return:
            The count of trainable parameters.
        """
        return 200

111
def create_benchmark(params='--num_steps 8'):
    """Register the fake benchmark and instantiate it.

    Args:
        params: extra command-line style parameters for the benchmark.

    Return:
        A FakeModelBenchmark instance built from predefined + given parameters.
    """
    # Register the FakeModelBenchmark benchmark for the CUDA platform with a
    # predefined parameter that is always prepended to user parameters.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model', FakeModelBenchmark, parameters='--hidden_size 2', platform=Platform.CUDA
    )

    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model', platform=Platform.CUDA, parameters=params, framework=Framework.PYTORCH
    )

    # Resolve the registered class via the registry's name-mangled private helpers.
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert (name)
    benchmark_class, predefine_params = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, context.platform)
    assert (benchmark_class)
    return benchmark_class(name, ' '.join([predefine_params, context.parameters]))


def test_arguments_related_interfaces():
    """Test arguments related interfaces.

    Benchmark.add_parser_arguments(),
    Benchmark.parse_args(),
    Benchmark.get_configurable_settings()
    """
    # Positive case for parse_args().
    benchmark = create_benchmark('--num_steps 9')
    benchmark.add_parser_arguments()
    (ret, args, unknown) = benchmark.parse_args()
    assert (ret and args.num_steps == 9)

    # Negative case for parse_args() - invalid precision.
    # 'fp32' is not a valid Precision value (the enum uses e.g. 'float32').
    benchmark = create_benchmark('--num_steps 8 --precision fp32')
    benchmark.add_parser_arguments()
    (ret, args, unknown) = benchmark.parse_args()
    assert (ret is False)

    # Test get_configurable_settings().
    # NOTE: the expected text must match the argparse help output byte-for-byte,
    # including indentation and line wrapping.
    settings = benchmark.get_configurable_settings()
    expected_settings = (
        """optional arguments:
  --batch_size int      The number of batch size.
  --distributed_backend DistributedBackend
                        Distributed backends. E.g. nccl mpi gloo.
  --distributed_impl DistributedImpl
                        Distributed implementations. E.g. ddp mirrored
                        multiworkermirrored parameterserver horovod.
  --duration int        The elapsed time of benchmark in seconds.
  --exclude_copy_time   Exclude GPU data copy time from measured time.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
                        Benchmark model process. E.g. train inference.
  --no_gpu              Disable GPU training.
  --num_steps int       The number of test step.
  --num_warmup int      The number of warmup step.
  --num_workers int     Number of subprocesses to use for data loading.
  --pin_memory          Enable option to pin memory in data loader.
  --precision Precision [Precision ...]
                        Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
                        float16 float32 float64 bfloat16 uint8 int8 int16
                        int32 int64.
  --run_count int       The run count of benchmark.
  --sample_count int    The number of data samples in dataset.
  --seq_len int         Sequence length."""
    )
    assert (settings == expected_settings)


def test_preprocess():
    """Test interface Benchmark._preprocess()."""
    # Positive case for _preprocess().
    benchmark = create_benchmark('--num_steps 8')
    assert (benchmark._preprocess())
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    # After a successful preprocess the configurable settings are still the
    # full argparse help text; it must match byte-for-byte.
    settings = benchmark.get_configurable_settings()
    expected_settings = (
        """optional arguments:
  --batch_size int      The number of batch size.
  --distributed_backend DistributedBackend
                        Distributed backends. E.g. nccl mpi gloo.
  --distributed_impl DistributedImpl
                        Distributed implementations. E.g. ddp mirrored
                        multiworkermirrored parameterserver horovod.
  --duration int        The elapsed time of benchmark in seconds.
  --exclude_copy_time   Exclude GPU data copy time from measured time.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
                        Benchmark model process. E.g. train inference.
  --no_gpu              Disable GPU training.
  --num_steps int       The number of test step.
  --num_warmup int      The number of warmup step.
  --num_workers int     Number of subprocesses to use for data loading.
  --pin_memory          Enable option to pin memory in data loader.
  --precision Precision [Precision ...]
                        Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
                        float16 float32 float64 bfloat16 uint8 int8 int16
                        int32 int64.
  --run_count int       The run count of benchmark.
  --sample_count int    The number of data samples in dataset.
  --seq_len int         Sequence length."""
    )
    assert (settings == expected_settings)

    # Negative case for _preprocess() - invalid precision.
    benchmark = create_benchmark('--num_steps 8 --precision fp32')
    assert (benchmark._preprocess() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

    # Negative case for _preprocess() - invalid benchmark type.
    # Deliberately corrupt _benchmark_type with a non-BenchmarkType value.
    benchmark = create_benchmark('--num_steps 8 --precision float32')
    benchmark._benchmark_type = Platform.CUDA
    assert (benchmark._preprocess() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_TYPE)


def test_train():
    """Test interface Benchmark.__train()."""
    # With num_steps=8 and 2.0ms per step, throughput per step is 16000 samples/s.
    benchmark = create_benchmark()
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
        '"start_time": null, "end_time": null, "raw_data": {'
        '"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
        '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0]}, '
        '"reduce_op": {"return_code": null, "fp32_train_step_time": null, "fp32_train_throughput": null}}'
    )
    assert (benchmark._preprocess())
    # __train is name-mangled; access it via _ModelBenchmark__train.
    assert (benchmark._ModelBenchmark__train(Precision.FLOAT32))
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))

    # Step time list is empty (simulate training failure).
    # return_code 3 corresponds to the invalid-benchmark-result code
    # (see test_benchmark: num_steps 0 -> ReturnCode.INVALID_BENCHMARK_RESULT).
    benchmark = create_benchmark('--num_steps 0')
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
        '"start_time": null, "end_time": null, "raw_data": {}, '
        '"result": {"return_code": [3]}, "reduce_op": {"return_code": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__train(Precision.FLOAT32) is False)
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))
262
263
264
265
266
267
268


def test_inference():
    """Test interface Benchmark.__inference()."""
    # With num_steps=8 and 4.0ms per step, throughput per step is 8000 samples/s.
    # Inference results also include percentile metrics (_50/_90/_95/_99/_99.9).
    benchmark = create_benchmark()
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
        '"start_time": null, "end_time": null, "raw_data": {'
        '"fp16_inference_step_time": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
        '"fp16_inference_throughput": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
        '"result": {"return_code": [0], "fp16_inference_step_time": [4.0], '
        '"fp16_inference_step_time_50": [4.0], "fp16_inference_step_time_90": [4.0], '
        '"fp16_inference_step_time_95": [4.0], "fp16_inference_step_time_99": [4.0], '
        '"fp16_inference_step_time_99.9": [4.0], '
        '"fp16_inference_throughput": [8000.0], '
        '"fp16_inference_throughput_50": [8000.0], "fp16_inference_throughput_90": [8000.0], '
        '"fp16_inference_throughput_95": [8000.0], "fp16_inference_throughput_99": [8000.0], '
        '"fp16_inference_throughput_99.9": [8000.0]}, '
        '"reduce_op": {"return_code": null, "fp16_inference_step_time": null, '
        '"fp16_inference_step_time_50": null, "fp16_inference_step_time_90": null, '
        '"fp16_inference_step_time_95": null, "fp16_inference_step_time_99": null, '
        '"fp16_inference_step_time_99.9": null, "fp16_inference_throughput": null, '
        '"fp16_inference_throughput_50": null, "fp16_inference_throughput_90": null, '
        '"fp16_inference_throughput_95": null, "fp16_inference_throughput_99": null, '
        '"fp16_inference_throughput_99.9": null}}'
    )
    assert (benchmark._preprocess())
    # __inference is name-mangled; access it via _ModelBenchmark__inference.
    assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))

    # Step time list is empty (simulate inference failure).
    benchmark = create_benchmark('--num_steps 0')
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
        '"start_time": null, "end_time": null, "raw_data": {}, '
        '"result": {"return_code": [3]}, "reduce_op": {"return_code": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16) is False)
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))
302
303
304
305
306
307
308
309
310
311
312
313
314


def test_benchmark():
    """Test interface Benchmark._benchmark()."""
    # Positive case for _benchmark().
    # Default model_action is train; both supported precisions (fp32, fp16) run.
    benchmark = create_benchmark()
    benchmark._preprocess()
    assert (benchmark._benchmark())
    assert (benchmark.name == 'pytorch-fake-model')
    assert (benchmark.type == BenchmarkType.MODEL)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    expected_raw_data = {
        'fp32_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
        'fp32_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
        'fp16_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
        'fp16_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
    }
    assert (benchmark.raw_data == expected_raw_data)
    expected_result = {
        'return_code': [0],
        'fp32_train_step_time': [2.0],
        'fp32_train_throughput': [16000.0],
        'fp16_train_step_time': [2.0],
        'fp16_train_throughput': [16000.0]
    }
    assert (benchmark.result == expected_result)

    expected_serialized_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, "start_time": null, '
        '"end_time": null, "raw_data": {"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
        '"fp16_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp16_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
        '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0], '
        '"fp16_train_step_time": [2.0], "fp16_train_throughput": [16000.0]}, '
        '"reduce_op": {"return_code": null, "fp32_train_step_time": null, "fp32_train_throughput": null, '
        '"fp16_train_step_time": null, "fp16_train_throughput": null}}'
    )
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_serialized_result))

    # Negative case for _benchmark() - no supported precision found.
    # int16 is a valid Precision but not in FakeModelBenchmark._supported_precision.
    benchmark = create_benchmark('--precision int16')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)

    # Negative case for _benchmark() - model train failure, step time list is empty.
    benchmark = create_benchmark('--num_steps 0')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)

    # Negative case for _benchmark() - model inference failure, step time list is empty.
    benchmark = create_benchmark('--model_action inference --num_steps 0')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)
360
361
362
363
364
365
366
367
368
369
370
371
372


def test_check_result_format():
    """Test interface Benchmark.__check_result_format()."""
    # Positive case for __check_result_format().
    # The private checkers are name-mangled on the Benchmark base class.
    benchmark = create_benchmark()
    benchmark._preprocess()
    assert (benchmark._benchmark())
    assert (benchmark._Benchmark__check_result_type())
    assert (benchmark._Benchmark__check_summarized_result())
    assert (benchmark._Benchmark__check_raw_data())

    # Negative case for __check_result_format() - change List[int] to List[str].
    # Overwrite the mangled private result dict with string-typed metric values.
    benchmark._result._BenchmarkResult__result = {'return_code': [0], 'metric1': ['2.0']}
    assert (benchmark._Benchmark__check_summarized_result() is False)

    # Negative case for __check_raw_data() - change List[List[int]] to List[List[str]].
    benchmark._result._BenchmarkResult__raw_data = {'metric1': [['2.0']]}
    assert (benchmark._Benchmark__check_raw_data() is False)

    # Negative case for __check_raw_data() - invalid benchmark result.
    assert (benchmark._Benchmark__check_result_format() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416


def test_is_finished():
    """Test interface Benchmark._is_finished().

    Covers the three termination modes: step-limited, duration-limited,
    and both limits active at once.
    """
    # Step-only mode: duration is 0, so completion depends on step count alone.
    benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 0')
    benchmark._preprocess()
    now = 2
    assert (benchmark._is_finished(50, now) is False)
    assert (benchmark._is_finished(160, now))

    # Duration-only mode: num_steps is 0, so completion depends on elapsed time alone.
    benchmark = create_benchmark('--num_warmup 32 --num_steps 0 --duration 10')
    benchmark._preprocess()
    benchmark._sub_benchmark_start_time = 0
    step = 50
    assert (benchmark._is_finished(step, 1) is False)
    assert (benchmark._is_finished(step, 10))

    # Both limits active: crossing either threshold finishes the run.
    benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 10')
    benchmark._preprocess()
    # Finishes due to the step limit.
    assert (benchmark._is_finished(160, 2))
    # Finishes due to the duration limit.
    assert (benchmark._is_finished(50, 10))