Unverified commit 53ac7947, authored by Jeff Rasley and committed by GitHub
Browse files

reduce size of megatron tests (#223)

parent 8a18e73e
......@@ -3,7 +3,10 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":2
"stage":2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true
},
"optimizer": {
"type": "Adam",
......
......@@ -3,7 +3,10 @@
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage":2
"stage":2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true
},
"optimizer": {
"type": "Adam",
......
......@@ -20,8 +20,8 @@ helpFunction()
exit 1
}
layers=24
hidden_size=1024
layers=2
hidden_size=128
seq_length=1024
ckpt_num_layers=1
other_args=""
......
......@@ -10,6 +10,16 @@ import time
import re
from .test_common import BaseTestCase
# Model dimensions shared by the test configurations in this file; they supply
# the "layers", "hidden_size" and "heads" entries.
LAYERS = 2
HIDDEN_SIZE = 128
ATTN_HEADS = 8
def remove_file(test_id, filename):
    """Delete ``filename`` through a bash one-liner if it is a regular file.

    The command is echoed to stdout, prefixed with ``test_id``, before it
    runs. A missing file is not an error, and the exit status of the command
    is ignored (``check=False``).

    Args:
        test_id: Label printed in front of the command.
        filename: Path of the file to remove.
    """
    import shlex  # local import keeps this block self-contained

    # Quote the path so that spaces or shell metacharacters in it cannot
    # split or alter the command; ordinary paths are left unchanged.
    quoted = shlex.quote(str(filename))
    cmd = f"if [ -f {quoted} ] ; then rm -v {quoted}; fi"
    print(f"{test_id} cmd: {cmd}")
    subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
def grep_loss_from_file(file_name):
loss = 0.0
......@@ -50,10 +60,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": 2,
"hidden_size": 256,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": 16,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero1",
"zero": True,
......@@ -72,10 +82,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": 2,
"hidden_size": 256,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": 16,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero2",
"zero": True,
......@@ -94,10 +104,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": 2,
"hidden_size": 256,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": 16,
"heads": ATTN_HEADS,
"deepspeed": True,
"zero": False,
"other_args": "",
......
......@@ -10,6 +10,11 @@ import time
import re
from .test_common import BaseTestCase
# Model dimensions shared by the test configurations in this file; they supply
# the "layers", "hidden_size", "heads" and "seq_length" entries.
LAYERS = 2
HIDDEN_SIZE = 128
ATTN_HEADS = 8
SEQ_LEN = 64
def grep_loss_from_file(file_name):
loss = 0.0
......@@ -50,10 +55,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 4,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs4_zero1.json",
}
......@@ -68,10 +73,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero1.json",
}
......@@ -86,10 +91,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero1.json",
}
......@@ -104,10 +109,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero1.json",
}
......@@ -122,10 +127,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 4,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs4_zero2.json",
}
......@@ -140,10 +145,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero2.json",
}
......@@ -158,10 +163,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero2.json",
}
......@@ -179,10 +184,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero2.json",
}
......@@ -200,10 +205,10 @@ class GPT2FuncTestCase(BaseTestCase):
"nodes": 1,
"bs": 4,
"steps": 20,
"layers": 12,
"hidden_size": 768,
"seq_length": 256,
"heads": 12,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_scheduler.json",
}
......
......@@ -29,14 +29,14 @@ def pytest_hack(runner_result):
assert runner_result.wasSuccessful() # fail the test
#def test_megatron():
# runner = unittest.TextTestRunner(failfast=True)
# pytest_hack(runner.run(Megatron_GPT2.suite()))
#
#
#def test_megatron_checkpoint():
# runner = unittest.TextTestRunner(failfast=True)
# pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite()))
def test_megatron():
    """Run ``Megatron_GPT2.suite()`` with a fail-fast text runner.

    The runner's result is handed to ``pytest_hack`` for checking.
    """
    result = unittest.TextTestRunner(failfast=True).run(Megatron_GPT2.suite())
    pytest_hack(result)
def test_megatron_checkpoint():
    """Run ``Megatron_GPT2.checkpoint_suite()`` with a fail-fast text runner.

    The runner's result is handed to ``pytest_hack`` for checking.
    """
    result = unittest.TextTestRunner(failfast=True).run(
        Megatron_GPT2.checkpoint_suite())
    pytest_hack(result)
def test_squad():
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment