Unverified commit 53ac7947, authored by Jeff Rasley, committed by GitHub
Browse files

reduce size of megatron tests (#223)

parent 8a18e73e
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":2 "stage":2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true
}, },
"optimizer": { "optimizer": {
"type": "Adam", "type": "Adam",
......
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":2 "stage":2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true
}, },
"optimizer": { "optimizer": {
"type": "Adam", "type": "Adam",
......
...@@ -20,8 +20,8 @@ helpFunction() ...@@ -20,8 +20,8 @@ helpFunction()
exit 1 exit 1
} }
layers=24 layers=2
hidden_size=1024 hidden_size=128
seq_length=1024 seq_length=1024
ckpt_num_layers=1 ckpt_num_layers=1
other_args="" other_args=""
......
...@@ -10,6 +10,16 @@ import time ...@@ -10,6 +10,16 @@ import time
import re import re
from .test_common import BaseTestCase from .test_common import BaseTestCase
LAYERS = 2
HIDDEN_SIZE = 128
ATTN_HEADS = 8
def remove_file(test_id, filename):
    """Delete ``filename`` via bash if it exists, logging the command first.

    Args:
        test_id: Label prefixed to the printed command line so the log
            output can be attributed to a specific test.
        filename: Path of the file to remove. It is shell-quoted before
            being placed in the command, so paths containing spaces or
            shell metacharacters are handled as a single argument.

    Returns:
        None. The command is run with ``check=False``, so a non-zero exit
        status from the shell does not raise.
    """
    # Local import keeps this helper self-contained without touching the
    # file-level import block.
    import shlex

    # shlex.quote leaves plain filenames untouched, so the printed command
    # is unchanged for ordinary paths; unusual paths are wrapped in quotes
    # instead of being word-split or interpreted by the shell.
    quoted = shlex.quote(str(filename))
    cmd = f"if [ -f {quoted} ] ; then rm -v {quoted}; fi"
    print(f"{test_id} cmd: {cmd}")
    subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
def grep_loss_from_file(file_name): def grep_loss_from_file(file_name):
loss = 0.0 loss = 0.0
...@@ -50,10 +60,10 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -50,10 +60,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1100, "steps": 1100,
"layers": 2, "layers": LAYERS,
"hidden_size": 256, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": 256,
"heads": 16, "heads": ATTN_HEADS,
"deepspeed": True, "deepspeed": True,
"tag": "ds_zero1", "tag": "ds_zero1",
"zero": True, "zero": True,
...@@ -72,10 +82,10 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -72,10 +82,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1100, "steps": 1100,
"layers": 2, "layers": LAYERS,
"hidden_size": 256, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": 256,
"heads": 16, "heads": ATTN_HEADS,
"deepspeed": True, "deepspeed": True,
"tag": "ds_zero2", "tag": "ds_zero2",
"zero": True, "zero": True,
...@@ -94,10 +104,10 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -94,10 +104,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1100, "steps": 1100,
"layers": 2, "layers": LAYERS,
"hidden_size": 256, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": 256,
"heads": 16, "heads": ATTN_HEADS,
"deepspeed": True, "deepspeed": True,
"zero": False, "zero": False,
"other_args": "", "other_args": "",
......
...@@ -10,6 +10,11 @@ import time ...@@ -10,6 +10,11 @@ import time
import re import re
from .test_common import BaseTestCase from .test_common import BaseTestCase
LAYERS = 2
HIDDEN_SIZE = 128
ATTN_HEADS = 8
SEQ_LEN = 64
def grep_loss_from_file(file_name): def grep_loss_from_file(file_name):
loss = 0.0 loss = 0.0
...@@ -50,10 +55,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -50,10 +55,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 4, "bs": 4,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs4_zero1.json", "json": "ds_config_func_bs4_zero1.json",
} }
...@@ -68,10 +73,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -68,10 +73,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs8_zero1.json", "json": "ds_config_func_bs8_zero1.json",
} }
...@@ -86,10 +91,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -86,10 +91,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs8_zero1.json", "json": "ds_config_func_bs8_zero1.json",
} }
...@@ -104,10 +109,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -104,10 +109,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs8_zero1.json", "json": "ds_config_func_bs8_zero1.json",
} }
...@@ -122,10 +127,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -122,10 +127,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 4, "bs": 4,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs4_zero2.json", "json": "ds_config_func_bs4_zero2.json",
} }
...@@ -140,10 +145,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -140,10 +145,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs8_zero2.json", "json": "ds_config_func_bs8_zero2.json",
} }
...@@ -158,10 +163,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -158,10 +163,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs8_zero2.json", "json": "ds_config_func_bs8_zero2.json",
} }
...@@ -179,10 +184,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -179,10 +184,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 8, "bs": 8,
"steps": 1000, "steps": 1000,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_bs8_zero2.json", "json": "ds_config_func_bs8_zero2.json",
} }
...@@ -200,10 +205,10 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -200,10 +205,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1, "nodes": 1,
"bs": 4, "bs": 4,
"steps": 20, "steps": 20,
"layers": 12, "layers": LAYERS,
"hidden_size": 768, "hidden_size": HIDDEN_SIZE,
"seq_length": 256, "seq_length": SEQ_LEN,
"heads": 12, "heads": ATTN_HEADS,
"deepspeed": False, "deepspeed": False,
"json": "ds_config_func_scheduler.json", "json": "ds_config_func_scheduler.json",
} }
......
...@@ -29,14 +29,14 @@ def pytest_hack(runner_result): ...@@ -29,14 +29,14 @@ def pytest_hack(runner_result):
assert runner_result.wasSuccessful() # fail the test assert runner_result.wasSuccessful() # fail the test
#def test_megatron(): def test_megatron():
# runner = unittest.TextTestRunner(failfast=True) runner = unittest.TextTestRunner(failfast=True)
# pytest_hack(runner.run(Megatron_GPT2.suite())) pytest_hack(runner.run(Megatron_GPT2.suite()))
#
#
#def test_megatron_checkpoint(): def test_megatron_checkpoint():
# runner = unittest.TextTestRunner(failfast=True) runner = unittest.TextTestRunner(failfast=True)
# pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite())) pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite()))
def test_squad(): def test_squad():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment