Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
98f5131b
"src/diffusers/commands/diffusers_cli.py" did not exist on "27d11a0094e292a8d790714d1b5cdf5e9186814d"
Commit
98f5131b
authored
Feb 03, 2020
by
Elton Zheng
Browse files
add test model Megatron_GPT2
parent
3bc21cd0
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1112 additions
and
0 deletions
+1112
-0
tests/model/Megatron_GPT2/__init__.py
tests/model/Megatron_GPT2/__init__.py
+8
-0
tests/model/Megatron_GPT2/ds_config_func_bs4.json
tests/model/Megatron_GPT2/ds_config_func_bs4.json
+21
-0
tests/model/Megatron_GPT2/ds_config_func_bs8.json
tests/model/Megatron_GPT2/ds_config_func_bs8.json
+21
-0
tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
+21
-0
tests/model/Megatron_GPT2/ds_config_func_scheduler.json
tests/model/Megatron_GPT2/ds_config_func_scheduler.json
+30
-0
tests/model/Megatron_GPT2/ds_config_perf_bs16.json
tests/model/Megatron_GPT2/ds_config_perf_bs16.json
+22
-0
tests/model/Megatron_GPT2/ds_config_perf_bs32.json
tests/model/Megatron_GPT2/ds_config_perf_bs32.json
+22
-0
tests/model/Megatron_GPT2/ds_config_perf_bs8.json
tests/model/Megatron_GPT2/ds_config_perf_bs8.json
+22
-0
tests/model/Megatron_GPT2/ds_gpt2_test.sh
tests/model/Megatron_GPT2/ds_gpt2_test.sh
+100
-0
tests/model/Megatron_GPT2/run_checkpoint_test.py
tests/model/Megatron_GPT2/run_checkpoint_test.py
+200
-0
tests/model/Megatron_GPT2/run_func_test.py
tests/model/Megatron_GPT2/run_func_test.py
+230
-0
tests/model/Megatron_GPT2/run_perf_baseline.py
tests/model/Megatron_GPT2/run_perf_baseline.py
+133
-0
tests/model/Megatron_GPT2/run_perf_test.py
tests/model/Megatron_GPT2/run_perf_test.py
+137
-0
tests/model/Megatron_GPT2/test_common.py
tests/model/Megatron_GPT2/test_common.py
+98
-0
tests/model/run_sanity_check.py
tests/model/run_sanity_check.py
+47
-0
No files found.
tests/model/Megatron_GPT2/__init__.py
0 → 100644
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
from
.run_func_test
import
GPT2FuncTestCase
from
.run_checkpoint_test
import
GPT2CheckpointTestCase
,
checkpoint_suite
from
.run_func_test
import
suite
tests/model/Megatron_GPT2/ds_config_func_bs4.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
4
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_func_bs8.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
8
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
8
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
false
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_func_scheduler.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
4
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"scheduler"
:
{
"type"
:
"WarmupLR"
,
"params"
:
{
"warmup_min_lr"
:
0
,
"warmup_max_lr"
:
0.001
,
"warmup_num_steps"
:
10
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_perf_bs16.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
16
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"disable_allgather"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_perf_bs32.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
32
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"disable_allgather"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_perf_bs8.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
8
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"disable_allgather"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_gpt2_test.sh
0 → 100755
View file @
98f5131b
#! /bin/bash

# Launcher for Megatron GPT2 test runs, optionally under DeepSpeed.
# Parses model/cluster sizing flags, assembles the pretrain_gpt2.py
# command line, and runs it from the Megatron-LM examples directory.

# Print usage and exit; called when required flags are missing or unknown.
helpFunction()
{
    echo ""
    echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers [-d]"
    echo -e "\t-m model parallelism"
    echo -e "\t-g gpus per node"
    echo -e "\t-n node count"
    echo -e "\t-b batch size"
    echo -e "\t-s training steps"
    echo -e "\t-l layers"
    echo -e "\t-h hidden size"
    echo -e "\t-q sequence length"
    echo -e "\t-e attention heads"
    echo -e "\t-c checkpoint num_layers"
    echo -e "\t-o other args"
    echo -e "\t-d DeepSpeed config json file"
    echo -e "\t-z Enable Zero optimization"
    exit 1
}

# Defaults for the optional sizing flags.
layers=24
hidden_size=1024
seq_length=1024
ckpt_num_layers=1
other_args=""
ds_opt=""
zero_opt=""

# Resolve this script's directory so -d config paths are script-relative.
script_path=$(realpath $0)
script_dir=$(dirname $script_path)

while getopts "m:g:n:b:s:l:h:q:e:c:o:d:z" opt
do
    case "$opt" in
        m ) mp="$OPTARG" ;;
        g ) gpus="$OPTARG" ;;
        n ) nodes="$OPTARG" ;;
        b ) bs="$OPTARG" ;;
        s ) steps="$OPTARG" ;;
        l ) layers="$OPTARG" ;;
        h ) hidden_size="$OPTARG" ;;
        q ) seq_length="$OPTARG" ;;
        e ) heads="$OPTARG" ;;
        c ) ckpt_num_layers="$OPTARG" ;;
        o ) other_args="$OPTARG" ;;
        d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;;
        z ) zero_opt="--zero_optimization" ;;
        ? ) helpFunction ;;
    esac
done

# Print helpFunction in case parameters are empty
if [ -z "$mp" ] || [ -z "$gpus" ] || [ -z "$nodes" ] || [ -z "$bs" ] || [ -z "$steps" ]
then
    echo "Some or all of the parameters are empty";
    helpFunction
fi

# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000

# Full pretrain_gpt2.py argument list; DeepSpeed / ZeRO flags are appended
# only when -d / -z were supplied (they expand to "" otherwise).
gpt_options=" \
       --model-parallel-size ${mp} \
       --num-layers ${layers} \
       --hidden-size ${hidden_size} \
       --num-attention-heads ${heads} \
       --batch-size ${bs} \
       --seq-length ${seq_length} \
       --max-position-embeddings ${seq_length} \
       --train-iters ${steps} \
       --train-data webtext \
       --lazy-loader \
       --tokenizer-type GPT2BPETokenizer \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --no-load-optim \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --checkpoint-num-layers ${ckpt_num_layers} \
       --fp16 \
       --log-interval 1 \
       ${other_args} \
       ${ds_opt} \
       ${zero_opt} \
"

work_dir="../../../examples/Megatron-LM/"
# Comma-separated local GPU indices 0..gpus-1 for the deepspeed.pt -i flag.
include_str=`seq 0 $(($gpus - 1)) | paste -sd "," -`
run_cmd="(cd ${work_dir} && deepspeed.pt -i localhost:${include_str} pretrain_gpt2.py ${gpt_options})"
echo ${run_cmd}
eval ${run_cmd}

set +x
tests/model/Megatron_GPT2/run_checkpoint_test.py
0 → 100644
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
.test_common
import
BaseTestCase
def grep_loss_from_file(file_name):
    """Return the final validation LM loss parsed from a Megatron log file.

    Scans *file_name* for the end-of-training summary line and extracts the
    numeric value following "LM loss:".

    Args:
        file_name: path of the log file produced by a GPT2 test run.

    Returns:
        The loss as a float, or 0.0 when no matching line was found
        (a warning is printed in that case).
    """
    loss = 0.0
    # Only the end-of-training summary line carries the loss we want.
    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string avoids the invalid-escape warning the original non-raw
    # pattern triggers on modern Python; matches ints, floats, and
    # scientific notation (e.g. "3.25E+00").
    match_number = re.compile(
        r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

    with open(file_name, 'r') as f:
        # Stream line-by-line instead of materializing readlines().
        for line in f:
            if line_filter in line:
                found = match_number.findall(line)
                # Guard against a filter hit whose number fails to parse,
                # which would have raised IndexError in the original.
                if found:
                    loss = float(found[0])

    if loss == 0.0:
        print("no loss found in file ", file_name)
    return loss
class GPT2CheckpointTestCase(BaseTestCase):
    """Checkpoint save/load parity tests for the Megatron GPT2 model.

    Each test trains a small model while saving checkpoints, restarts from
    the saved checkpoint, and verifies the final loss of the resumed run
    matches the uninterrupted run within a relative tolerance.
    """

    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super(GPT2CheckpointTestCase, self).__init__(methodName)

    def setUp(self):
        # Run from this file's directory so relative paths (ds configs,
        # ds_gpt2_test.sh) resolve; the cwd is restored in tearDown.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def test_mp4_gpu16_node1_with_zero(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
            "layers": 2,
            "hidden_size": 256,
            "seq_length": 256,
            "heads": 16,
            "deepspeed": True,
            "tag": "ds_zero",
            "zero": True,
            "other_args": "",
            "checkpoint_name": "ckpt_mp4_gpu16_w_zero",
            "checkpoint_interval": 1000,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp4_gpu16_node1_without_zero(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
            "layers": 2,
            "hidden_size": 256,
            "seq_length": 256,
            "heads": 16,
            "deepspeed": True,
            "zero": False,
            "other_args": "",
            "tag": "ds_without_zero",
            "checkpoint_name": "ckpt_mp4_gpu16_wo_zero",
            "checkpoint_interval": 1000,
            "json": "ds_config_func_bs8_no_zero.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def gen_name(self, test_config, prefix):
        """Build the log-file path for a run, keyed by the config's tag."""
        save_dir = "checkpoint_test_logs"
        tag = test_config["tag"]
        file_name = f"_{tag}.log"
        return os.path.join(save_dir, prefix + file_name)

    def run_test(self, test_config, r_tol):
        """Run a save-then-load checkpoint cycle.

        Returns True when the resumed run's loss matches the saving run's
        loss within relative tolerance *r_tol*.
        """
        print("\n")
        print("{0}: starting......".format(self.id()))

        # save to current directory.
        checkpoint_folder = test_config["checkpoint_name"]
        checkpoint_interval = test_config["checkpoint_interval"]
        checkpoint_name = test_config["checkpoint_name"]

        #---------------remove old checkpoint---------------#
        try:
            cmd = f"rm -rf {checkpoint_name}"
            print(f"{self.id()} cmd: {cmd}")
            # BUG FIX: the original passed stdout=f/stderr=f with `f`
            # undefined; the resulting NameError was swallowed by a bare
            # except, so the stale checkpoint was never actually removed.
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash')
        except Exception:
            print("No old checkpoint")

        #-----------------Saving Checkpoint-----------------#
        # building checkpoint arguments
        test_config[
            "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval}\""

        prefix = "gpt2_saving_checkpoint"

        # create checkpoint run...
        base_file = self.gen_name(test_config, prefix)

        # remove previous test log
        try:
            cmd = f"rm {base_file}"
            subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        except Exception:
            print(f"{self.id()} No old logs")

        print("{0}: Run for saving checkpoint".format(self.id()))
        self.run_gpt2_test(test_config, base_file)

        #-----------------Loading Checkpoint-----------------#
        # building checkpoint arguments
        test_config["other_args"] = f"\"--load {checkpoint_folder}\""

        # set checkpoint load iteration so Megatron resumes from the save
        try:
            cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt"
            print(f"{self.id()} running cmd: {cmd}")
            subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        except Exception:
            print(f"{self.id()} Failed to update the checkpoint iteration file")
            return False

        prefix = "gpt2_loading_checkpoint"
        print("{0}: Second run loading checkpoint and continuing.".format(
            self.id()))
        test_file = self.gen_name(test_config, prefix)

        # remove previous test log
        try:
            cmd = f"rm {test_file}"
            subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        except Exception:
            print(f"{self.id()} no previous logs for")

        self.run_gpt2_test(test_config, test_file)
        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """True if *file_name* exists and contains a non-zero LM loss."""
        has_loss = False
        if os.path.exists(file_name):
            loss = grep_loss_from_file(file_name)
            if loss != 0.0:
                has_loss = True
        return has_loss

    def check_parity(self, base_file, test_file, r_tol):
        """Compare the two runs' losses; True when the relative difference
        is within *r_tol*. Fails if either loss is missing (0.0)."""
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)
        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
        if base_loss == 0.0 or test_loss == 0.0:
            return False
        if abs((base_loss - test_loss) / base_loss) > r_tol:
            return False
        return True
def checkpoint_suite():
    """Assemble the checkpoint-parity tests in a fixed order."""
    tests = unittest.TestSuite()
    for case_name in ('test_mp4_gpu16_node1_with_zero',
                      'test_mp4_gpu16_node1_without_zero'):
        tests.addTest(GPT2CheckpointTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later cases depend on working checkpoints.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(checkpoint_suite())
tests/model/Megatron_GPT2/run_func_test.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
.test_common
import
BaseTestCase
def grep_loss_from_file(file_name):
    """Return the final validation LM loss parsed from a Megatron log file.

    Scans *file_name* for the end-of-training summary line and extracts the
    numeric value following "LM loss:".

    Args:
        file_name: path of the log file produced by a GPT2 test run.

    Returns:
        The loss as a float, or 0.0 when no matching line was found
        (a warning is printed in that case).
    """
    loss = 0.0
    # Only the end-of-training summary line carries the loss we want.
    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string avoids the invalid-escape warning the original non-raw
    # pattern triggers on modern Python; matches ints, floats, and
    # scientific notation (e.g. "3.25E+00").
    match_number = re.compile(
        r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

    with open(file_name, 'r') as f:
        # Stream line-by-line instead of materializing readlines().
        for line in f:
            if line_filter in line:
                found = match_number.findall(line)
                # Guard against a filter hit whose number fails to parse,
                # which would have raised IndexError in the original.
                if found:
                    loss = float(found[0])

    if loss == 0.0:
        print("no loss found in file ", file_name)
    return loss
class GPT2FuncTestCase(BaseTestCase):
    """Functional parity tests for the Megatron GPT2 model under DeepSpeed.

    Each test runs a Megatron-only baseline (reused if its log already
    exists) and a DeepSpeed run with the same configuration, then checks
    the two final losses agree within a relative tolerance.
    """

    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super(GPT2FuncTestCase, self).__init__(methodName)

    def setUp(self):
        # Run from this file's directory so relative paths (ds configs,
        # ds_gpt2_test.sh) resolve; the cwd is restored in tearDown.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def test_mp1_gpu1_node1(self):
        test_config = {
            "mp": 1,
            "gpus": 1,
            "nodes": 1,
            "bs": 4,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs4.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp1_gpu2_node1(self):
        test_config = {
            "mp": 1,
            "gpus": 2,
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp2_gpu4_node1(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

        # Same config again, this time exercising partition-activations.
        succ = self.run_partition_activations_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp4_gpu4_node1(self):
        test_config = {
            "mp": 4,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

        # Same config again, this time exercising partition-activations.
        succ = self.run_partition_activations_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_optimizer_scheduler(self):
        test_config = {
            "mp": 1,
            "gpus": 1,
            "nodes": 1,
            "bs": 4,
            "steps": 20,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_scheduler.json",
        }
        succ = self.run_test(test_config, 0.01)
        # assure no crash.
        self.assertTrue(True)

    def run_partition_activations_test(self, test_config, r_tol):
        """Run baseline vs DeepSpeed with --partition-activations and
        return True when the final losses agree within *r_tol*."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "gpt2_partition_activation_"

        # baseline run...
        test_config["deepspeed"] = False
        base_file = self.gen_output_name(test_config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(test_config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        test_config["deepspeed"] = True
        test_config["other_args"] = "--partition-activations"
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def run_test(self, test_config, r_tol):
        """Run baseline vs DeepSpeed with identical settings and return
        True when the final losses agree within *r_tol*."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "gpt2_func"

        # baseline run...
        test_config["deepspeed"] = False
        base_file = self.gen_output_name(test_config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(test_config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        test_config["deepspeed"] = True
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """True if *file_name* exists and contains a non-zero LM loss."""
        has_loss = False
        if os.path.exists(file_name):
            loss = grep_loss_from_file(file_name)
            if loss != 0.0:
                has_loss = True
        return has_loss

    def check_parity(self, base_file, test_file, r_tol):
        """Compare the two runs' losses; True when the relative difference
        is within *r_tol*. Fails if either loss is missing (0.0)."""
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)
        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
        if base_loss == 0.0 or test_loss == 0.0:
            return False
        if abs((base_loss - test_loss) / base_loss) > r_tol:
            return False
        return True
def suite():
    """Assemble the functional tests from smallest to largest topology."""
    tests = unittest.TestSuite()
    for case_name in ('test_mp1_gpu1_node1',
                      'test_mp1_gpu2_node1',
                      'test_mp2_gpu4_node1',
                      'test_mp4_gpu4_node1',
                      'test_optimizer_scheduler'):
        tests.addTest(GPT2FuncTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later cases reuse earlier baselines.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
tests/model/Megatron_GPT2/run_perf_baseline.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
test_common
import
BaseTestCase
class GPT2PerfBaselineTestCase(BaseTestCase):
    """Megatron-only (no DeepSpeed) baseline performance runs for GPT2.

    Each test launches a fixed-size training run and reports the average
    per-iteration latency parsed from the run's log file.
    """

    def __init__(self, methodName="DeepSpeed performance test on GPT2 model"):
        super(GPT2PerfBaselineTestCase, self).__init__(methodName)

    def test_perf_1_5B(self):
        test_config = {
            "mp": 2,
            "gpus": 16,
            "nodes": 4,
            "bs": 16,
            "steps": 100,
            "layers": 48,
            "hidden_size": 1600,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_4B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 64,
            "hidden_size": 2304,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_8B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 72,
            "hidden_size": 3072,
            "seq_length": 1024,
            "heads": 24,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_20B(self):
        test_config = {
            "mp": 16,
            "gpus": 16,
            "nodes": 4,
            "bs": 4,
            "steps": 50,
            "layers": 111,
            "hidden_size": 3808,
            "seq_length": 1024,
            "heads": 32,
            "ckpt_num_layers": 1,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def run_test(self, test_config):
        """Launch the run described by *test_config* and print its average
        per-iteration latency (or a warning when none is found)."""
        print("\n")
        print("{0}: starting......".format(self.id()))
        prefix = "gpt2_perf"

        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)
        exec_time = self.grep_latency_from_file(test_file)

        if exec_time == 0.0:
            print("{0}: no latency found in file {1}".format(
                self.id(), test_file))
        else:
            print("{0}: execution time per iteration is {1}ms.".format(
                self.id(), exec_time))

    def grep_latency_from_file(self, file_name):
        """Return the mean per-iteration latency in ms reported in
        *file_name*, or 0.0 when no matching line exists."""
        latency = 0.0
        count = 0
        with open(file_name, 'r') as f:
            lines = f.readlines()
            line_filter = "elapsed time per iteration"
            # BUG FIX: raw string — the original non-raw pattern used
            # invalid escape sequences ("\(", "\)") that newer Pythons
            # warn about and may eventually reject.
            match_number = re.compile(
                r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)'
            )

            for line in lines:
                if line_filter in line:
                    ms_per_iter = match_number.findall(line)
                    latency += float(ms_per_iter[0])
                    count += 1

        # Average over all reported iterations.
        if count > 0:
            latency /= count
        return latency
def suite():
    """Assemble the baseline perf runs from smallest to largest model."""
    tests = unittest.TestSuite()
    for case_name in ('test_perf_1_5B', 'test_perf_4B',
                      'test_perf_8B', 'test_perf_20B'):
        tests.addTest(GPT2PerfBaselineTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later, larger runs would waste hours.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
tests/model/Megatron_GPT2/run_perf_test.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
test_common
import
BaseTestCase
class GPT2PerfTestCase(BaseTestCase):
    """DeepSpeed-enabled performance runs for the Megatron GPT2 model.

    Each test launches a fixed-size training run under DeepSpeed and
    reports the average per-iteration latency parsed from the log file.
    """

    def __init__(self, methodName="DeepSpeed performance test on GPT2 model"):
        super(GPT2PerfTestCase, self).__init__(methodName)

    def test_perf_1_5B(self):
        test_config = {
            "mp": 1,
            "gpus": 16,
            "nodes": 4,
            "bs": 32,
            "steps": 100,
            "layers": 48,
            "hidden_size": 1600,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": True,
            "json": "ds_config_perf_bs32.json",
        }
        self.run_test(test_config)

    def test_perf_4B(self):
        test_config = {
            "mp": 1,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 64,
            "hidden_size": 2304,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": True,
            "json": "ds_config_perf_bs8.json",
        }
        self.run_test(test_config)

    def test_perf_8B(self):
        test_config = {
            "mp": 2,
            "gpus": 16,
            "nodes": 4,
            "bs": 16,
            "steps": 100,
            "layers": 72,
            "hidden_size": 3072,
            "seq_length": 1024,
            "heads": 24,
            "deepspeed": True,
            "json": "ds_config_perf_bs16.json",
        }
        self.run_test(test_config)

    def test_perf_20B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 50,
            "layers": 111,
            "hidden_size": 3808,
            "seq_length": 1024,
            "heads": 32,
            "ckpt_num_layers": 1,
            "deepspeed": True,
            "json": "ds_config_perf_bs8.json",
        }
        self.run_test(test_config)

    def run_test(self, test_config):
        """Launch the run described by *test_config* and print its average
        per-iteration latency (or a warning when none is found)."""
        print("\n")
        print("{0}: starting......".format(self.id()))
        prefix = "gpt2_perf"

        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)
        exec_time = self.grep_latency_from_file(test_file)

        if exec_time == 0.0:
            print("{0}: no latency found in file {1}".format(
                self.id(), test_file))
        else:
            print("{0}: execution time per iteration is {1}ms.".format(
                self.id(), exec_time))

    def grep_latency_from_file(self, file_name):
        """Return the mean per-iteration latency in ms reported in
        *file_name*, or 0.0 when no matching line exists."""
        latency = 0.0
        count = 0
        with open(file_name, 'r') as f:
            lines = f.readlines()
            line_filter = "elapsed time per iteration"
            # BUG FIX: raw string — the original non-raw pattern used
            # invalid escape sequences ("\(", "\)") that newer Pythons
            # warn about and may eventually reject.
            match_number = re.compile(
                r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)'
            )

            for line in lines:
                if line_filter in line:
                    ms_per_iter = match_number.findall(line)
                    latency += float(ms_per_iter[0])
                    count += 1

        # Average over all reported iterations.
        if count > 0:
            latency /= count
        return latency
def suite():
    """Assemble the DeepSpeed perf runs from smallest to largest model."""
    tests = unittest.TestSuite()
    for case_name in ('test_perf_1_5B', 'test_perf_4B',
                      'test_perf_8B', 'test_perf_20B'):
        tests.addTest(GPT2PerfTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later, larger runs would waste hours.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
tests/model/Megatron_GPT2/test_common.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
import
unittest
import
subprocess
import
os
import
time
import
re
class BaseTestCase(unittest.TestCase):
    """Shared plumbing for the Megatron GPT2 model tests.

    Provides log-file naming, directory creation, environment cleanup, and
    the subprocess launch of ds_gpt2_test.sh.
    """

    def __init__(self, methodName="DeepSpeed performance test"):
        super(BaseTestCase, self).__init__(methodName)
        self.test_dir = "./test"          # logs of DeepSpeed runs
        self.baseline_dir = "./baseline"  # logs of Megatron-only baselines
        # One timestamp per test-case instance so DeepSpeed logs of the
        # same config don't overwrite each other across sessions.
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

    def gen_output_name(self, test_config, prefix):
        """Build a log-file path encoding the run configuration.

        DeepSpeed runs are timestamped and kept under test_dir; baselines
        are deterministic names under baseline_dir so they can be reused.
        """
        other_args = test_config[
            "other_args"] if "other_args" in test_config else ""
        zero_args = "_zero" if "zero" in test_config and test_config[
            "zero"] else ""

        # Flatten the extra CLI flags into a filename-safe token.
        other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "")

        if other_args:
            other_args = "_" + other_args

        if test_config["deepspeed"]:
            file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format(
                test_config["mp"], test_config["gpus"], test_config["nodes"],
                test_config["bs"], test_config["steps"], test_config["layers"],
                test_config["hidden_size"], test_config["seq_length"],
                test_config["heads"], other_args, zero_args, self.timestr)
            save_dir = self.test_dir
        else:
            file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}.log".format(
                test_config["mp"], test_config["gpus"], test_config["nodes"],
                test_config["bs"], test_config["steps"], test_config["layers"],
                test_config["hidden_size"], test_config["seq_length"],
                test_config["heads"], other_args)
            save_dir = self.baseline_dir

        return os.path.join(save_dir, prefix + file_name)

    def ensure_directory_exists(self, filename):
        """Create the parent directory of *filename* if it is missing."""
        dirname = os.path.dirname(filename)
        # BUG FIX: exist_ok avoids the exists()/makedirs() race of the
        # original, and the dirname guard avoids makedirs('') raising
        # when *filename* has no directory component.
        if dirname:
            os.makedirs(dirname, exist_ok=True)

    def clean_test_env(self):
        """Kill stray python processes on the cluster and let NCCL settle."""
        cmd = "dlts_ssh pkill -9 -f /usr/bin/python"
        print(cmd)
        subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        time.sleep(20)

    def run_gpt2_test(self, test_config, output):
        """Launch ds_gpt2_test.sh for *test_config*, logging to *output*."""
        ds_flag = "-d " + test_config["json"] if test_config["deepspeed"] else ""
        ckpt_num = test_config[
            "ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1
        other_args = "-o " + test_config[
            "other_args"] if "other_args" in test_config else ""

        cmd = "./ds_gpt2_test.sh -m {0} -g {1} -n {2} -b {3} -s {4} -l {5} -h {6} -q {7} -e {8} -c {9} {10} {11}".format(
            test_config["mp"], test_config["gpus"], test_config["nodes"],
            test_config["bs"], test_config["steps"], test_config["layers"],
            test_config["hidden_size"], test_config["seq_length"],
            test_config["heads"], ckpt_num, other_args, ds_flag)

        self.ensure_directory_exists(output)
        with open(output, "w") as f:
            print(cmd)
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash',
                           stdout=f,
                           stderr=f)
tests/model/run_sanity_check.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
sys
import
unittest
sys
.
path
.
append
(
'../examples/Megatron_GPT2'
)
sys
.
path
.
append
(
'../examples/BingBertSquad'
)
sys
.
path
.
append
(
'../examples/QANet-Pytorch'
)
sys
.
path
.
append
(
'../examples/bing_bert'
)
import
os
# Import the test cases here.
import
Megatron_GPT2
import
BingBertSquad
import
bing_bert
def pytest_hack(runner_result):
    """Bridge a unittest runner result into pytest.

    Pytest does not surface failures from unittest suites driven through a
    TextTestRunner, so we dump the failures to stderr ourselves and assert
    on the overall outcome. Long-term, these model tests should be adapted
    to pytest.
    """
    if not runner_result.wasSuccessful():
        print('SUITE UNSUCCESSFUL:', file=sys.stderr)
        for failure in runner_result.failures:
            print(failure, file=sys.stderr)
    assert runner_result.wasSuccessful()  # fail the test
def test_run():
    """Pytest entry point: run every enabled model suite in sequence."""
    test_runner = unittest.TextTestRunner(failfast=True)

    # Add test suites here.
    pytest_hack(test_runner.run(Megatron_GPT2.suite()))
    pytest_hack(test_runner.run(Megatron_GPT2.checkpoint_suite()))
    #pytest_hack(runner.run(BingBertSquad.suite()))
    #pytest_hack(runner.run(bing_bert.checkpoint_suite()))
    #pytest_hack(runner.run(bing_bert.pretrain_suite()))


if __name__ == '__main__':
    test_run()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment