test_model_base.py 16.3 KB
Newer Older
1
2
3
4
5
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for BenchmarkRegistry module."""

6
7
import json

8
from superbench.benchmarks import Platform, Framework, Precision, BenchmarkRegistry, BenchmarkType, ReturnCode
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from superbench.benchmarks.model_benchmarks import ModelBenchmark


class FakeModelBenchmark(ModelBenchmark):
    """Fake benchmark inherit from ModelBenchmark."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name: benchmark name.
            parameters: benchmark parameters.
        """
        super().__init__(name, parameters)
        # Limit support to two precisions so the precision-filtering paths can be tested.
        self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16]

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()
        self._parser.add_argument('--hidden_size', type=int, default=1024, required=False, help='Hidden size.')
        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')

    def _judge_gpu_availability(self):
        """Judge GPUs' availability according to arguments and running environment."""
        # Always report no GPU so the tests run on any machine.
        self._gpu_available = False

    def _set_force_fp32(self):
        """Set the config that controls whether full float32 precision will be used."""
        pass

    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU."""
        return True

    def _generate_dataset(self):
        """Generate dataset for benchmarking according to shape info."""
        return True

    def _init_dataloader(self):
        """Initialize the distributed dataloader."""
        return True

    def _create_optimizer(self):
        """Create the optimzier instance used for training."""
        return True

    def _create_model(self, precision):
        """Construct the model for benchmarking."""
        return True

    def _train_step(self, precision):
        """Define the training process.

        Args:
            precision (str): precision of model and input data,
              such as float, half.

        Return:
            The step-time list of every training step.
        """
        # Every training step reports a fixed 2.0 time units.
        return [2.0 for _ in range(self._args.num_steps)]

    def _inference_step(self, precision):
        """Define the inference process.

        Args:
            precision (str): precision of model and input data,
              such as float, half.

        Return:
            The latency list of every inference operation.
        """
        # Every inference operation reports a fixed 4.0 time units.
        return [4.0 for _ in range(self._args.num_steps)]

    def _cal_params_count(self):
        """Calculate the parameters scale of the model.

        Return:
            The count of trainable parameters.
        """
        return 200


110
def create_benchmark(params='--num_steps 8'):
    """Register and create benchmark."""
    # Register the FakeModelBenchmark benchmark.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model', FakeModelBenchmark, parameters='--hidden_size 2', platform=Platform.CUDA
    )

    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model', platform=Platform.CUDA, parameters=params, framework=Framework.PYTORCH
    )
    # Resolve the registered class through the registry's name-mangled private helpers.
    benchmark_name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert benchmark_name
    benchmark_cls, predefined_params = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, context.platform
    )
    assert benchmark_cls
    # Combine registry-predefined parameters with the caller-supplied ones.
    return benchmark_cls(benchmark_name, predefined_params + ' ' + context.parameters)


def test_arguments_related_interfaces():
    """Test arguments related interfaces.

    Benchmark.add_parser_arguments(),
    Benchmark.parse_args(),
    Benchmark.get_configurable_settings()
    """
    # Positive case for parse_args().
    benchmark = create_benchmark('--num_steps 9')
    benchmark.add_parser_arguments()
    (ret, args, unknown) = benchmark.parse_args()
    assert (ret and args.num_steps == 9)

    # Negative case for parse_args() - invalid precision.
    # 'fp32' is not a valid Precision value, so parsing must fail.
    benchmark = create_benchmark('--num_steps 8 --precision fp32')
    benchmark.add_parser_arguments()
    (ret, args, unknown) = benchmark.parse_args()
    assert (ret is False)

    # Test get_configurable_settings().
    # Expected text is the argparse-generated help for all registered arguments,
    # including the fake model's --hidden_size and --seq_len.
    # NOTE(review): argparse renamed this section header to 'options:' in Python
    # 3.10 — this literal assumes an older Python; confirm the target version.
    settings = benchmark.get_configurable_settings()
    expected_settings = (
        """optional arguments:
  --batch_size int      The number of batch size.
  --distributed_backend DistributedBackend
                        Distributed backends. E.g. nccl mpi gloo.
  --distributed_impl DistributedImpl
                        Distributed implementations. E.g. ddp mirrored
                        multiworkermirrored parameterserver horovod.
  --duration int        The elapsed time of benchmark in seconds.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
                        Benchmark model process. E.g. train inference.
  --no_gpu              Disable GPU training.
  --num_steps int       The number of test step.
  --num_warmup int      The number of warmup step.
  --num_workers int     Number of subprocesses to use for data loading.
  --pin_memory          Enable option to pin memory in data loader.
  --precision Precision [Precision ...]
                        Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
                        float16 float32 float64 bfloat16 uint8 int8 int16
                        int32 int64.
  --run_count int       The run count of benchmark.
  --sample_count int    The number of data samples in dataset.
  --seq_len int         Sequence length."""
    )
    assert (settings == expected_settings)


def test_preprocess():
    """Test interface Benchmark._preprocess()."""
    # Positive case for _preprocess().
    benchmark = create_benchmark('--num_steps 8')
    assert (benchmark._preprocess())
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    # After preprocessing the configurable settings must still render the same help text.
    # NOTE(review): argparse renamed this section header to 'options:' in Python
    # 3.10 — this literal assumes an older Python; confirm the target version.
    settings = benchmark.get_configurable_settings()
    expected_settings = (
        """optional arguments:
  --batch_size int      The number of batch size.
  --distributed_backend DistributedBackend
                        Distributed backends. E.g. nccl mpi gloo.
  --distributed_impl DistributedImpl
                        Distributed implementations. E.g. ddp mirrored
                        multiworkermirrored parameterserver horovod.
  --duration int        The elapsed time of benchmark in seconds.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
                        Benchmark model process. E.g. train inference.
  --no_gpu              Disable GPU training.
  --num_steps int       The number of test step.
  --num_warmup int      The number of warmup step.
  --num_workers int     Number of subprocesses to use for data loading.
  --pin_memory          Enable option to pin memory in data loader.
  --precision Precision [Precision ...]
                        Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
                        float16 float32 float64 bfloat16 uint8 int8 int16
                        int32 int64.
  --run_count int       The run count of benchmark.
  --sample_count int    The number of data samples in dataset.
  --seq_len int         Sequence length."""
    )
    assert (settings == expected_settings)

    # Negative case for _preprocess() - invalid precision.
    benchmark = create_benchmark('--num_steps 8 --precision fp32')
    assert (benchmark._preprocess() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

    # Negative case for _preprocess() - invalid benchmark type.
    benchmark = create_benchmark('--num_steps 8 --precision float32')
    # Deliberately assign a Platform value (wrong enum) as the benchmark type.
    benchmark._benchmark_type = Platform.CUDA
    assert (benchmark._preprocess() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_TYPE)


def test_train():
    """Test interface Benchmark.__train()."""
    benchmark = create_benchmark()
    # 8 default steps, each reporting the fake model's fixed 2.0 step time.
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
        '"start_time": null, "end_time": null, "raw_data": {'
        '"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
        '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0]}, '
        '"reduce_op": {"return_code": null, "fp32_train_step_time": null, "fp32_train_throughput": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__train(Precision.FLOAT32))
    # Compare as parsed JSON so key ordering in the serialized form does not matter.
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))

    # Step time list is empty (simulate training failure).
    benchmark = create_benchmark('--num_steps 0')
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
        '"start_time": null, "end_time": null, "raw_data": {}, '
        '"result": {"return_code": [3]}, "reduce_op": {"return_code": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__train(Precision.FLOAT32) is False)
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))


def test_inference():
    """Test interface Benchmark.__inference()."""
    benchmark = create_benchmark()
    # 8 default steps, each reporting the fake model's fixed 4.0 inference latency;
    # inference results also include 50/90/95/99/99.9 percentile metrics.
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
        '"start_time": null, "end_time": null, "raw_data": {'
        '"fp16_inference_step_time": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
        '"fp16_inference_throughput": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
        '"result": {"return_code": [0], "fp16_inference_step_time": [4.0], '
        '"fp16_inference_step_time_50": [4.0], "fp16_inference_step_time_90": [4.0], '
        '"fp16_inference_step_time_95": [4.0], "fp16_inference_step_time_99": [4.0], '
        '"fp16_inference_step_time_99.9": [4.0], '
        '"fp16_inference_throughput": [8000.0], '
        '"fp16_inference_throughput_50": [8000.0], "fp16_inference_throughput_90": [8000.0], '
        '"fp16_inference_throughput_95": [8000.0], "fp16_inference_throughput_99": [8000.0], '
        '"fp16_inference_throughput_99.9": [8000.0]}, '
        '"reduce_op": {"return_code": null, "fp16_inference_step_time": null, '
        '"fp16_inference_step_time_50": null, "fp16_inference_step_time_90": null, '
        '"fp16_inference_step_time_95": null, "fp16_inference_step_time_99": null, '
        '"fp16_inference_step_time_99.9": null, "fp16_inference_throughput": null, '
        '"fp16_inference_throughput_50": null, "fp16_inference_throughput_90": null, '
        '"fp16_inference_throughput_95": null, "fp16_inference_throughput_99": null, '
        '"fp16_inference_throughput_99.9": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
    # Compare as parsed JSON so key ordering in the serialized form does not matter.
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))

    # Step time list is empty (simulate inference failure).
    benchmark = create_benchmark('--num_steps 0')
    expected_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
        '"start_time": null, "end_time": null, "raw_data": {}, '
        '"result": {"return_code": [3]}, "reduce_op": {"return_code": null}}'
    )
    assert (benchmark._preprocess())
    assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16) is False)
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_result))


def test_benchmark():
    """Test interface Benchmark._benchmark()."""
    # Positive case for _benchmark().
    benchmark = create_benchmark()
    benchmark._preprocess()
    assert (benchmark._benchmark())
    assert (benchmark.name == 'pytorch-fake-model')
    assert (benchmark.type == BenchmarkType.MODEL)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    # Both supported precisions (fp32 and fp16) are trained; each yields 8 steps
    # of the fake model's fixed 2.0 step time.
    expected_raw_data = {
        'fp32_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
        'fp32_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
        'fp16_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
        'fp16_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
    }
    assert (benchmark.raw_data == expected_raw_data)
    expected_result = {
        'return_code': [0],
        'fp32_train_step_time': [2.0],
        'fp32_train_throughput': [16000.0],
        'fp16_train_step_time': [2.0],
        'fp16_train_throughput': [16000.0]
    }
    assert (benchmark.result == expected_result)

    expected_serialized_result = (
        '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, "start_time": null, '
        '"end_time": null, "raw_data": {"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
        '"fp16_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
        '"fp16_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
        '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0], '
        '"fp16_train_step_time": [2.0], "fp16_train_throughput": [16000.0]}, '
        '"reduce_op": {"return_code": null, "fp32_train_step_time": null, "fp32_train_throughput": null, '
        '"fp16_train_step_time": null, "fp16_train_throughput": null}}'
    )
    # Compare as parsed JSON so key ordering in the serialized form does not matter.
    assert (json.loads(benchmark.serialized_result) == json.loads(expected_serialized_result))

    # Negative case for _benchmark() - no supported precision found.
    # int16 is not in FakeModelBenchmark._supported_precision (fp32/fp16 only).
    benchmark = create_benchmark('--precision int16')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)

    # Negative case for _benchmark() - model train failure, step time list is empty.
    benchmark = create_benchmark('--num_steps 0')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)

    # Negative case for _benchmark() - model inference failure, step time list is empty.
    benchmark = create_benchmark('--model_action inference --num_steps 0')
    assert (benchmark._preprocess())
    assert (benchmark._benchmark() is False)
    assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)


def test_check_result_format():
    """Test interface Benchmark.__check_result_format()."""
    # A full successful run must produce a result that passes every format check.
    benchmark = create_benchmark()
    benchmark._preprocess()
    assert benchmark._benchmark()
    assert benchmark._Benchmark__check_result_type()
    assert benchmark._Benchmark__check_summarized_result()
    assert benchmark._Benchmark__check_raw_data()

    # Corrupt the summarized result: a metric's List[int] becomes List[str].
    benchmark._result._BenchmarkResult__result = {'return_code': [0], 'metric1': ['2.0']}
    assert benchmark._Benchmark__check_summarized_result() is False

    # Corrupt the raw data: List[List[int]] becomes List[List[str]].
    benchmark._result._BenchmarkResult__raw_data = {'metric1': [['2.0']]}
    assert benchmark._Benchmark__check_raw_data() is False

    # With both corrupted, the overall format check fails and sets the return code.
    assert benchmark._Benchmark__check_result_format() is False
    assert benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT