# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for distributed inference benchmark."""

import unittest

from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase

import tests.benchmarks.utils as utils
from superbench.benchmarks \
    import BenchmarkRegistry, Framework, BenchmarkType, ReturnCode, Precision, DistributedImpl, DistributedBackend, \
    Platform
from superbench.benchmarks.micro_benchmarks.dist_inference \
    import DistInference, ComputationKernelType, CommunicationKernelType, ActivationKernelType
from superbench.common.utils import network


# TODO - replace unittest.skip("no multiple GPUs") to decorator of skipIfNoMultiGPUS
@unittest.skip('no multiple GPUs')
@decorator.cuda_test
@decorator.pytorch_test
def test_pytorch_dist_inference_normal():
    """Test pytorch-dist-inference benchmark on distributed normal case.

    Launches the benchmark across a simulated 2-rank DDP world and validates
    every predefined default argument plus the result/metric layout on each rank.
    """
    context = BenchmarkRegistry.create_benchmark_context(
        'dist-inference', parameters='--use_pytorch', framework=Framework.PYTORCH
    )
    world_size = 2
    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
    # One benchmark result object is returned per simulated rank.
    results = utils.simulated_ddp_distributed_benchmark(context, world_size)
    assert (results)
    for benchmark in results:
        # Check basic information.
        assert (benchmark)
        assert (isinstance(benchmark, DistInference))
        assert (benchmark.name == 'pytorch-dist-inference')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check predefined parameters of dist-inference benchmark.
        assert (benchmark._args.use_pytorch is True)
        assert (benchmark._args.batch_size == 64)
        assert (benchmark._args.input_size == 1024)
        assert (benchmark._args.hidden_size == 1024)
        assert (benchmark._args.alpha == 1.0)
        assert (benchmark._args.beta == 1.0)
        assert (benchmark._args.num_layers == 1)
        assert (benchmark._args.computation_kernel == ComputationKernelType.MATMUL)
        assert (benchmark._args.communication_kernel == CommunicationKernelType.ALLREDUCE)
        assert (benchmark._args.activation_kernel == ActivationKernelType.RELU)
        assert (benchmark._args.precision == Precision.FLOAT32)
        assert (benchmark._args.num_warmup == 50)
        assert (benchmark._args.num_steps == 10000)
        assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
        assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
        assert (benchmark._args.use_cuda_graph is False)
        assert (benchmark._args.tune_gemm is False)

        # Check results and metrics.
        assert (benchmark.run_count == 1)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        # step_times
        assert (len(benchmark.raw_data) == 1)
        # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
        assert (7 == len(benchmark.result))


@decorator.cuda_test
@decorator.pytorch_test
def test_pytorch_dist_inference_fake_distributed():
    """Test pytorch-dist-inference benchmark on single gpu.

    Sets up a fake single-rank DDP environment (world_size=1, rank=0) on a free
    port, runs the benchmark, and validates defaults and result layout. The
    simulated env vars are cleaned up at the end.
    """
    context = BenchmarkRegistry.create_benchmark_context(
        'dist-inference', parameters='--use_pytorch', framework=Framework.PYTORCH
    )
    port = network.get_free_port()
    assert (port)
    utils.setup_simulated_ddp_distributed_env(1, 0, port)
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, DistInference))
    assert (benchmark.name == 'pytorch-dist-inference')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check predefined parameters of dist-inference benchmark.
    assert (benchmark._args.use_pytorch is True)
    assert (benchmark._args.batch_size == 64)
    assert (benchmark._args.input_size == 1024)
    assert (benchmark._args.hidden_size == 1024)
    assert (benchmark._args.alpha == 1.0)
    assert (benchmark._args.beta == 1.0)
    assert (benchmark._args.num_layers == 1)
    assert (benchmark._args.computation_kernel == ComputationKernelType.MATMUL)
    assert (benchmark._args.communication_kernel == CommunicationKernelType.ALLREDUCE)
    assert (benchmark._args.activation_kernel == ActivationKernelType.RELU)
    assert (benchmark._args.precision == Precision.FLOAT32)
    assert (benchmark._args.num_warmup == 50)
    assert (benchmark._args.num_steps == 10000)
    assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
    assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
    assert (benchmark._args.use_cuda_graph is False)
    assert (benchmark._args.tune_gemm is False)

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    # step_times
    assert (len(benchmark.raw_data) == 1)
    # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
    assert (len(benchmark.result) == 7)

    utils.clean_simulated_ddp_distributed_env()


class DistInferenceCppImplTest(BenchmarkTestCase, unittest.TestCase):
    """Test class for pytorch-dist-inference benchmark (C++ implementation path)."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        # The cpp impl shells out to this binary; mock it so _preprocess succeeds.
        cls.createMockFiles(cls, ['bin/dist_inference'])

    def _test_dist_inference_command_generation(self, platform):
        """Test pytorch-dist-inference cpp impl benchmark command generation.

        Args:
            platform (Platform): platform (CUDA/ROCm) used to select the benchmark class.
        """
        benchmark_name = 'pytorch-dist-inference'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)

        # Distinct values for every wrapper parameter so mix-ups are detectable.
        batch_size = 1
        input_size = 2
        hidden_size = 3
        alpha = 4.0
        beta = 5.0
        num_layers = 6
        num_warmup = 7
        num_steps = 8
        wrapper_params_format_str = \
            '--batch_size %d --input_size %d --hidden_size %d ' \
            '--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph --tune_gemm'
        parameters = wrapper_params_format_str % (
            batch_size, input_size, hidden_size, alpha, beta, num_layers, num_warmup, num_steps
        )
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.use_pytorch is False)
        assert (benchmark._args.batch_size == batch_size)
        assert (benchmark._args.input_size == input_size)
        assert (benchmark._args.hidden_size == hidden_size)
        assert (benchmark._args.alpha == alpha)
        assert (benchmark._args.beta == beta)
        assert (benchmark._args.num_layers == num_layers)
        assert (benchmark._args.num_warmup == num_warmup)
        assert (benchmark._args.num_steps == num_steps)
        assert (benchmark._args.use_cuda_graph is True)
        assert (benchmark._args.tune_gemm is True)

        # Check command
        assert (1 == len(benchmark._commands))
        for cmd in benchmark._commands:
            # GEMM mapping: m=hidden, n=batch, k=input.
            m, n, k = hidden_size, batch_size, input_size
            bench_params_format_str = \
                '%s -m %d -n %d -k %d --alpha %g --beta %g ' + \
                '--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph --tune_gemm'
            assert (
                cmd == (
                    bench_params_format_str %
                    (benchmark._DistInference__bin_path, m, n, k, alpha, beta, num_layers, num_warmup, num_steps)
                )
            )

    @decorator.cuda_test
    def test_dist_inference_command_generation_cuda(self):
        """Test pytorch-dist-inference cpp impl benchmark command generation, CUDA case."""
        self._test_dist_inference_command_generation(Platform.CUDA)

    @decorator.rocm_test
    def test_dist_inference_command_generation_rocm(self):
        """Test pytorch-dist-inference cpp impl benchmark command generation, ROCm case."""
        self._test_dist_inference_command_generation(Platform.ROCM)

    @decorator.load_data('tests/data/dist_inference.log')
    def _test_dist_inference_result_parsing(self, platform, test_raw_output):
        """Test pytorch-dist-inference cpp impl benchmark result parsing.

        Args:
            platform (Platform): platform (CUDA/ROCm) used to select the benchmark class.
            test_raw_output (str): sample raw output injected by the load_data decorator.
        """
        benchmark_name = 'pytorch-dist-inference'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)
        benchmark = benchmark_class(benchmark_name, parameters='')
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'pytorch-dist-inference')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Positive case - valid raw output.
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # step_times
        assert (len(benchmark.raw_data) == 2)
        # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
        assert (7 == len(benchmark.result))
        # Expected values correspond to tests/data/dist_inference.log.
        assert (benchmark.result['return_code'] == [0])
        assert (benchmark.result['step_times'] == [1.9052048])
        assert (benchmark.result['step_times_50'] == [1.851])
        assert (benchmark.result['step_times_90'] == [1.89637])
        assert (benchmark.result['step_times_95'] == [2.12037])
        assert (benchmark.result['step_times_99'] == [2.67155])
        assert (benchmark.result['step_times_99.9'] == [4.4198])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

    @decorator.cuda_test
    def test_dist_inference_result_parsing_cuda(self):
        """Test pytorch-dist-inference cpp impl benchmark result parsing, CUDA case."""
        self._test_dist_inference_result_parsing(Platform.CUDA)

    @decorator.rocm_test
    def test_dist_inference_result_parsing_rocm(self):
        """Test pytorch-dist-inference cpp impl benchmark result parsing, ROCm case."""
        self._test_dist_inference_result_parsing(Platform.ROCM)