reduce size of megatron tests (#223)

53ac7947 · Jeff Rasley · GitHub · 8a18e73e · 53ac7947 · 53ac7947
Unverified commit 53ac7947, authored May 20, 2020 by Jeff Rasley; committed by GitHub on May 20, 2020.
6 changed files
--- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json
@@ -3,7 +3,10 @@
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "zero_optimization": {
-    "stage":2
+    "stage":2,
+    "reduce_bucket_size": 7000000,
+    "allgather_bucket_size": 7000000,
+    "reduce_scatter": true
  },
  "optimizer": {
    "type": "Adam",

--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json
@@ -3,7 +3,10 @@
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "zero_optimization": {
-    "stage":2
+    "stage":2,
+    "reduce_bucket_size": 7000000,
+    "allgather_bucket_size": 7000000,
+    "reduce_scatter": true
  },
  "optimizer": {
    "type": "Adam",

--- a/tests/model/Megatron_GPT2/ds_gpt2_test.sh
+++ b/tests/model/Megatron_GPT2/ds_gpt2_test.sh
@@ -20,8 +20,8 @@ helpFunction()
    exit 1
 }

-layers=24
-hidden_size=1024
+layers=2
+hidden_size=128
 seq_length=1024
 ckpt_num_layers=1
 other_args=""

--- a/tests/model/Megatron_GPT2/run_checkpoint_test.py
+++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py
@@ -10,6 +10,16 @@ import time
 import re
 from .test_common import BaseTestCase

+LAYERS = 2
+HIDDEN_SIZE = 128
+ATTN_HEADS = 8
+
+
+def remove_file(test_id, filename):
+    cmd = f"if [ -f {filename} ] ; then rm -v {filename}; fi"
+    print(f"{test_id} cmd: {cmd}")
+    subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
+

 def grep_loss_from_file(file_name):
    loss = 0.0
@@ -50,10 +60,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
-            "layers": 2,
-            "hidden_size": 256,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
            "seq_length": 256,
-            "heads": 16,
+            "heads": ATTN_HEADS,
            "deepspeed": True,
            "tag": "ds_zero1",
            "zero": True,
@@ -72,10 +82,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
-            "layers": 2,
-            "hidden_size": 256,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
            "seq_length": 256,
-            "heads": 16,
+            "heads": ATTN_HEADS,
            "deepspeed": True,
            "tag": "ds_zero2",
            "zero": True,
@@ -94,10 +104,10 @@ class GPT2CheckpointTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
-            "layers": 2,
-            "hidden_size": 256,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
            "seq_length": 256,
-            "heads": 16,
+            "heads": ATTN_HEADS,
            "deepspeed": True,
            "zero": False,
            "other_args": "",

--- a/tests/model/Megatron_GPT2/run_func_test.py
+++ b/tests/model/Megatron_GPT2/run_func_test.py
@@ -10,6 +10,11 @@ import time
 import re
 from .test_common import BaseTestCase

+LAYERS = 2
+HIDDEN_SIZE = 128
+ATTN_HEADS = 8
+SEQ_LEN = 64
+

 def grep_loss_from_file(file_name):
    loss = 0.0
@@ -50,10 +55,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 4,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs4_zero1.json",
        }
@@ -68,10 +73,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs8_zero1.json",
        }
@@ -86,10 +91,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs8_zero1.json",
        }
@@ -104,10 +109,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs8_zero1.json",
        }
@@ -122,10 +127,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 4,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs4_zero2.json",
        }
@@ -140,10 +145,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs8_zero2.json",
        }
@@ -158,10 +163,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs8_zero2.json",
        }
@@ -179,10 +184,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_bs8_zero2.json",
        }
@@ -200,10 +205,10 @@ class GPT2FuncTestCase(BaseTestCase):
            "nodes": 1,
            "bs": 4,
            "steps": 20,
-            "layers": 12,
-            "hidden_size": 768,
-            "seq_length": 256,
-            "heads": 12,
+            "layers": LAYERS,
+            "hidden_size": HIDDEN_SIZE,
+            "seq_length": SEQ_LEN,
+            "heads": ATTN_HEADS,
            "deepspeed": False,
            "json": "ds_config_func_scheduler.json",
        }

--- a/tests/model/run_sanity_check.py
+++ b/tests/model/run_sanity_check.py
@@ -29,14 +29,14 @@ def pytest_hack(runner_result):
        assert runner_result.wasSuccessful()  # fail the test


-#def test_megatron():
-#    runner = unittest.TextTestRunner(failfast=True)
-#    pytest_hack(runner.run(Megatron_GPT2.suite()))
-#
-#
-#def test_megatron_checkpoint():
-#    runner = unittest.TextTestRunner(failfast=True)
-#    pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite()))
+def test_megatron():
+    runner = unittest.TextTestRunner(failfast=True)
+    pytest_hack(runner.run(Megatron_GPT2.suite()))
+
+
+def test_megatron_checkpoint():
+    runner = unittest.TextTestRunner(failfast=True)
+    pytest_hack(runner.run(Megatron_GPT2.checkpoint_suite()))


 def test_squad():