"vscode:/vscode.git/clone" did not exist on "b6a61c3f7d0267c77f8626167cc1eda0335f2753"
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import deepspeed
from unit.common import DistributedTest
from deepspeed.git_version_info import version as ds_version
import os
from unit.simple_model import SimpleModel


@pytest.fixture
def ds_config():
    """Return a DeepSpeed config dict containing only an enabled elasticity section.

    A new dict is built on every invocation, so a test may mutate the
    returned value without affecting other tests.
    """
    return {
        "elasticity": {
            "enabled": True,
            "max_train_batch_size": 10000,
            "micro_batch_sizes": [8, 12, 16, 17],
            "min_gpus": 32,
            "max_gpus": 1500,
            "min_time": 20,
            "version": 0.1,
        }
    }


def test_basic_10k(ds_config):
    """Check the elastic batch size and GPU counts computed for the default fixture.

    For every returned GPU count the final batch size must divide evenly
    across GPUs, and the resulting per-GPU batch must be a multiple of at
    least one configured micro-batch size. The exact expected results for
    this config (23 GPU counts, batch size 9792) are pinned at the end.
    """
    final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config,
                                                                               target_deepspeed_version=ds_version)
    micro_batch_sizes = ds_config['elasticity']['micro_batch_sizes']

    for gpu_num in valid_gpus:
        assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}"
        batch_per_gpu = final_batch_size // gpu_num

        # Recomputed for each GPU count so that a match found for an earlier
        # count can never mask a later count that has no usable micro-batch.
        found_valid_mb = any(batch_per_gpu % mb == 0 for mb in micro_batch_sizes)
        assert found_valid_mb, "No valid mb found"

    assert len(valid_gpus) == 23
    assert final_batch_size == 9792


def test_old_version(ds_config):
    """Expect ``ElasticityError`` when the target DeepSpeed version is given as "0.2"."""
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version="0.2")


def test_disabled(ds_config):
    """Expect ``ElasticityError`` when the elasticity section has ``enabled`` set to False."""
    ds_config['elasticity']['enabled'] = False
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_valid_world_size(ds_config):
    """With ``world_size=64`` the call returns a third value, the micro-batch size, expected to be 17."""
    # Only the third element is checked; the three-way unpack still verifies
    # that a triple is returned when a world size is supplied.
    _, _, mbsize = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config,
                                                                target_deepspeed_version=ds_version,
                                                                world_size=64)
    assert mbsize == 17


def test_invalid_world_size(ds_config):
    """Expect ``ElasticityIncompatibleWorldSize`` when ``world_size=128`` is requested."""
    with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config,
                                                    target_deepspeed_version=ds_version,
                                                    world_size=128)


def test_future_elastic_version(ds_config):
    """Expect ``ElasticityError`` when the elasticity ``version`` is set to '0.3'."""
    ds_config['elasticity']['version'] = '0.3'
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_missing_max_batch(ds_config):
    """Expect ``ElasticityError`` when ``max_train_batch_size`` is absent from the elasticity section."""
    del ds_config['elasticity']['max_train_batch_size']
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_missing_micro_batch(ds_config):
    """Expect ``ElasticityError`` when ``micro_batch_sizes`` is absent from the elasticity section."""
    del ds_config['elasticity']['micro_batch_sizes']
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_empty_config():
    """Expect ``ElasticityError`` when the elasticity section contains nothing but ``enabled: True``."""
    ds_config = {"elasticity": {"enabled": True}}
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_model_parallel_v1_invalid(ds_config):
    """Expect ``ElasticityError`` when model-parallel settings are combined with elasticity version 0.1."""
    ds_config["elasticity"]["model_parallel_size"] = 4
    ds_config["elasticity"]["num_gpus_per_node"] = 8
    ds_config["elasticity"]["version"] = 0.1

    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_model_parallel_v2_invalid(ds_config):
    """Expect ``ElasticityError`` under elasticity version 0.2 for this combination.

    The combination is ``model_parallel_size=16``, ``num_gpus_per_node=8``
    and an explicit ``world_size=16``.
    """
    elastic_section = ds_config["elasticity"]
    elastic_section.update({
        "model_parallel_size": 16,
        "num_gpus_per_node": 8,
        "version": 0.2,
    })

    expected_error = deepspeed.elasticity.config.ElasticityError
    with pytest.raises(expected_error):
        deepspeed.elasticity.compute_elastic_config(
            ds_config=ds_config,
            target_deepspeed_version=ds_version,
            world_size=16,
        )


def test_model_parallel_v2_valid(ds_config):
    """Elasticity version 0.2 with ``model_parallel_size=4`` must compute without raising.

    No ``world_size`` argument is passed; ``WORLD_SIZE`` is exported as "16"
    for the duration of the call (presumably the library reads it from the
    environment in this case -- confirm against ``compute_elastic_config``).
    """
    ds_config["elasticity"]["model_parallel_size"] = 4
    ds_config["elasticity"]["num_gpus_per_node"] = 8
    ds_config["elasticity"]["version"] = 0.2

    previous_world_size = os.environ.get("WORLD_SIZE")
    os.environ["WORLD_SIZE"] = str(16)
    try:
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)
    finally:
        # Always put the environment back, even when the call above fails, so
        # that a failure here cannot leak WORLD_SIZE into later tests and a
        # value exported by the surrounding launcher is not discarded.
        if previous_world_size is None:
            os.environ.pop("WORLD_SIZE", None)
        else:
            os.environ["WORLD_SIZE"] = previous_world_size


@pytest.mark.parametrize('key, value', [
    ('micro_batch_sizes', [1, 4, -1, 2, -10]),
    ('min_gpus', -1),
    ('max_gpus', -1),
    ('micro_batch_sizes', 5),
    ('micro_batch_sizes', ['a', None, 0.5]),
    ('micro_batch_sizes', [2, 0.5, 4]),
])
def test_invalid_config_values(key, value, ds_config):
    """Expect ``ElasticityError`` when one elasticity setting is overwritten with an invalid value.

    Args:
        key: Name of the elasticity setting to overwrite.
        value: The invalid value assigned to that setting.
        ds_config: Base config fixture that is mutated for this case.
    """
    ds_config['elasticity'][key] = value
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)


def test_proper_mbsz(ds_config):
    """With a max batch of 32, micro-batches [1, 2, 3, 7] and ``world_size=7``, expect micro-batch size 3."""
    ds_config["elasticity"]["max_train_batch_size"] = 32
    ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7]
    ds_config["elasticity"]["min_gpus"] = 1
    # Only the selected micro-batch size is checked; the three-way unpack
    # still verifies that a triple is returned when a world size is supplied.
    _, _, mbsize = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config,
                                                                target_deepspeed_version=ds_version,
                                                                world_size=7)
    assert mbsize == 3


class TestNonElasticBatchParams(DistributedTest):
    """Engine initialization must fail for this config.

    The config sets ``train_batch_size`` next to an enabled elasticity
    section and does not set ``ignore_non_elastic_batch_info``.
    """
    world_size = 2

    def test(self):
        """Expect ``deepspeed.initialize`` to raise ``ElasticityError`` for this config."""
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "elasticity": {
                "enabled": True,
                "max_train_batch_size": 4,
                "micro_batch_sizes": [1, 2, 3, 4],
                "min_gpus": 1,
                "max_gpus": 4,
                "min_time": 20,
                "version": 0.1
            }
        }
        hidden_dim = 10

        model = SimpleModel(hidden_dim, empty_grad=False)

        with pytest.raises(deepspeed.elasticity.config.ElasticityError):
            deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())


class TestNonElasticBatchParamsWithOverride(DistributedTest):
    """Engine initialization must succeed for this config.

    The config sets ``train_batch_size`` next to an enabled elasticity
    section, with ``ignore_non_elastic_batch_info`` set to True.
    """
    world_size = 2

    def test(self):
        """Expect ``deepspeed.initialize`` to complete without raising for this config."""
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "elasticity": {
                "enabled": True,
                "max_train_batch_size": 4,
                "micro_batch_sizes": [1, 2, 3, 4],
                "min_gpus": 1,
                "max_gpus": 4,
                "min_time": 20,
                "version": 0.1,
                "ignore_non_elastic_batch_info": True
            }
        }
        hidden_dim = 10

        model = SimpleModel(hidden_dim, empty_grad=False)
        # The test passes as long as initialization does not raise; the
        # returned engine is not inspected.
        deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())


class TestElasticConfigChanged(DistributedTest):
    """Engine initialization must fail when the scheduler's elastic config differs from the runtime one.

    The scheduler's view is published through the
    ``DEEPSPEED_ELASTICITY_CONFIG`` environment variable with a different
    ``max_train_batch_size`` (27) than the runtime config (4).
    """
    world_size = 2

    def test(self):
        """Expect ``deepspeed.initialize`` to raise ``ElasticityError`` on the mismatch."""
        import copy
        import json
        import os

        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "elasticity": {
                "enabled": True,
                "max_train_batch_size": 4,
                "micro_batch_sizes": [1, 2, 3, 4],
                "min_gpus": 1,
                "max_gpus": 4,
                "min_time": 20,
                "version": 0.1,
                "ignore_non_elastic_batch_info": True
            }
        }

        # A deep copy is required: a shallow copy shares the nested
        # "elasticity" dict, so editing the scheduler's view would silently
        # edit the runtime config too and the two would never differ.
        scheduler_elastic_config = copy.deepcopy(config_dict)
        scheduler_elastic_config["elasticity"]["max_train_batch_size"] = 27

        # NOTE(review): the payload is the full DeepSpeed config rather than
        # only its "elasticity" section -- confirm this is the shape the
        # runtime expects to find in DEEPSPEED_ELASTICITY_CONFIG.
        previous_env_value = os.environ.get('DEEPSPEED_ELASTICITY_CONFIG')
        os.environ['DEEPSPEED_ELASTICITY_CONFIG'] = json.dumps(scheduler_elastic_config)
        try:
            hidden_dim = 10

            model = SimpleModel(hidden_dim, empty_grad=False)

            with pytest.raises(deepspeed.elasticity.config.ElasticityError):
                deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        finally:
            # Restore the environment so the scheduler config does not leak
            # into anything else that runs in this process.
            if previous_env_value is None:
                os.environ.pop('DEEPSPEED_ELASTICITY_CONFIG', None)
            else:
                os.environ['DEEPSPEED_ELASTICITY_CONFIG'] = previous_env_value