Unverified Commit bc046dc4 authored by Jeff Rasley's avatar Jeff Rasley Committed by GitHub
Browse files

add additional validation checks in elastic config (#646)

parent 828d75ba
...@@ -63,9 +63,39 @@ class ElasticityConfig: ...@@ -63,9 +63,39 @@ class ElasticityConfig:
MAX_ACCEPTABLE_BATCH_SIZE, MAX_ACCEPTABLE_BATCH_SIZE,
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT) MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT)
self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT) self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT)
# Validate the user-supplied micro batch sizes before they are used:
# the value must be a list, every entry must be an int, and every
# entry must be strictly positive. Each failure raises
# ElasticityConfigError with the offending value in the message.
if not isinstance(self.micro_batches, list):
    raise ElasticityConfigError(
        f"Elasticity expected value of {MICRO_BATCHES} to be a "
        f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}"
    )
# The integer check runs before the positivity check so that the
# `m > 0` comparison below is only evaluated on ints.
if not all(isinstance(m, int) for m in self.micro_batches):
    raise ElasticityConfigError(
        f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, "
        f"instead contains: {self.micro_batches}")
if not all(m > 0 for m in self.micro_batches):
    raise ElasticityConfigError(
        f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, "
        f"instead contains: {self.micro_batches}")
self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT) self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT)
self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT) self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT)
if self.min_gpus < 1 or self.max_gpus < 1:
raise ElasticityConfigError(
"Elasticity min/max gpus must be > 0, "
f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}")
if self.max_gpus < self.min_gpus:
raise ElasticityConfigError(
"Elasticity min_gpus cannot be greater than max_gpus, "
f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}")
self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT) self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT)
if self.min_time < 0:
raise ElasticityConfigError(
f"Elasticity min time needs to be >= 0: given {self.min_time}")
self.version = param_dict.get(VERSION, VERSION_DEFAULT) self.version = param_dict.get(VERSION, VERSION_DEFAULT)
self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH, self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH,
PREFER_LARGER_BATCH_DEFAULT) PREFER_LARGER_BATCH_DEFAULT)
......
...@@ -46,9 +46,9 @@ MIN_GPUS_DEFAULT = 1 ...@@ -46,9 +46,9 @@ MIN_GPUS_DEFAULT = 1
MAX_GPUS = 'max_gpus' MAX_GPUS = 'max_gpus'
MAX_GPUS_DEFAULT = 10000 MAX_GPUS_DEFAULT = 10000
# Minimum running time (minutes) before the scheduler will scale us # Minimum running time (minutes) before the scheduler will scale us, 0 implies it's unknown
MIN_TIME = "min_time" MIN_TIME = "min_time"
MIN_TIME_DEFAULT = "20" MIN_TIME_DEFAULT = 0
# When finding a suitable batch size, attempt to find one that is closest # When finding a suitable batch size, attempt to find one that is closest
# to the max train batch size given. # to the max train batch size given.
......
...@@ -107,6 +107,35 @@ def test_empty_config(): ...@@ -107,6 +107,35 @@ def test_empty_config():
target_deepspeed_version=ds_version) target_deepspeed_version=ds_version)
@pytest.mark.parametrize('key, value',
                         [('micro_batch_sizes', [1, 4, -1, 2, -10]),
                          ('min_gpus', -1),
                          ('max_gpus', -1),
                          ('micro_batch_sizes', 5),
                          ('micro_batch_sizes', ['a', None, 0.5]),
                          ('micro_batch_sizes', [2, 0.5, 4])])
def test_invalid_config_values(key, value):
    """Check that an invalid elasticity setting is rejected.

    Each parametrized ``(key, value)`` pair overrides one entry under
    ``ds_config['elasticity']``; ``compute_elastic_config`` is then expected
    to raise ``ElasticityError``.
    """
    import copy

    # A shallow ``.copy()`` would share the nested 'elasticity' dict with
    # ``base_ds_config``, so an invalid value written here would leak into
    # later parametrized cases and other tests. Deep-copy to isolate each case.
    ds_config = copy.deepcopy(base_ds_config)
    ds_config['elasticity'][key] = value
    with pytest.raises(deepspeed.elasticity.config.ElasticityError):
        deepspeed.elasticity.compute_elastic_config(ds_config=ds_config,
                                                    target_deepspeed_version=ds_version)
def test_proper_mbsz(): def test_proper_mbsz():
ds_config = base_ds_config.copy() ds_config = base_ds_config.copy()
ds_config["elasticity"]["max_train_batch_size"] = 32 ds_config["elasticity"]["max_train_batch_size"] = 32
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment