test_common.py 3.52 KB
Newer Older
Elton Zheng's avatar
Elton Zheng committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#

import unittest
import subprocess
import os
import time
import re


class BaseTestCase(unittest.TestCase):
    """Shared helpers for test cases that launch ``ds_gpt2_test.sh`` runs.

    The class itself defines no test methods. It provides log-file naming,
    output-directory creation, environment cleanup and the launcher for a
    single GPT-2 run.

    A ``test_config`` dict passed to the helpers must contain the keys
    ``mp``, ``gpus``, ``nodes``, ``bs``, ``steps``, ``layers``,
    ``hidden_size``, ``seq_length``, ``heads`` and ``deepspeed``. The keys
    ``other_args``, ``zero`` and ``ckpt_num_layers`` are optional, and
    ``json`` is read only when ``deepspeed`` is truthy.
    """

    def __init__(self, methodName="DeepSpeed performance test"):
        """Set the output directories and the timestamp used in log names.

        Args:
            methodName: Name of the test method to run, forwarded to
                ``unittest.TestCase``.
        """
        # NOTE(review): unittest.TestCase raises ValueError when methodName
        # is not an attribute of the instance (unless it is "runTest"), so
        # the default here presumably only serves as a placeholder and
        # callers are expected to pass a real test method name -- confirm.
        super().__init__(methodName)
        self.test_dir = "./test"
        self.baseline_dir = "./baseline"
        # Captured once at construction (local time) and reused by every
        # DeepSpeed log name generated through this instance.
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

    def gen_output_name(self, test_config, prefix):
        """Build the log-file path that describes ``test_config``.

        Args:
            test_config: Run configuration dict (see the class docstring).
            prefix: Text placed in front of the generated file name.

        Returns:
            A path under ``self.test_dir`` when ``test_config["deepspeed"]``
            is truthy (the name then ends in ``_ds[_zero]-<timestr>.log``),
            otherwise a path under ``self.baseline_dir`` with no timestamp.

        Raises:
            KeyError: If a required key is missing from ``test_config``.
        """
        use_deepspeed = test_config["deepspeed"]

        # Reduce the extra command-line arguments to a compact tag: trim
        # surrounding spaces, dashes and backslashes, then drop every inner
        # space and double quote.
        other_args = test_config.get("other_args", "")
        other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "")
        if other_args:
            other_args = "_" + other_args

        # Both kinds of run share this stem; only the suffix differs.
        stem = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}".format(
            test_config["mp"],
            test_config["gpus"],
            test_config["nodes"],
            test_config["bs"],
            test_config["steps"],
            test_config["layers"],
            test_config["hidden_size"],
            test_config["seq_length"],
            test_config["heads"],
            other_args)

        if use_deepspeed:
            zero_args = "_zero" if test_config.get("zero") else ""
            suffix = "_ds{0}-{1}.log".format(zero_args, self.timestr)
            save_dir = self.test_dir
        else:
            suffix = ".log"
            save_dir = self.baseline_dir

        return os.path.join(save_dir, prefix + stem + suffix)

    def ensure_directory_exists(self, filename):
        """Create the parent directory of ``filename`` if it is missing.

        A ``filename`` without a directory component refers to the current
        working directory, so nothing is created in that case.

        Args:
            filename: Path of a file that is about to be written.

        Raises:
            OSError: If the directory cannot be created, e.g. because the
                parent path already exists as a regular file.
        """
        dirname = os.path.dirname(filename)
        # os.makedirs("") raises FileNotFoundError, so skip bare file names.
        if dirname:
            # exist_ok avoids the race between an existence check and the
            # creation when another process makes the directory first.
            os.makedirs(dirname, exist_ok=True)

    def clean_test_env(self):
        """Run the ``pkill`` cleanup command via bash, then wait 20 seconds.

        The command's exit status is ignored (``check=False``).
        """
        # NOTE(review): dlts_ssh presumably fans the command out to the
        # cluster nodes -- confirm against the test environment.
        cmd = "dlts_ssh pkill -9 -f /usr/bin/python"
        print(cmd)
        subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        time.sleep(20)

    def run_gpt2_test(self, test_config, output):
        """Launch ``./ds_gpt2_test.sh`` and capture its output in a file.

        The command line is printed, then executed through bash with both
        stdout and stderr redirected to ``output``. The exit status is not
        checked, so a failing run does not raise here.

        Args:
            test_config: Run configuration dict (see the class docstring).
            output: Path of the log file; its parent directory is created
                when needed and an existing file is overwritten.

        Raises:
            KeyError: If a required key is missing from ``test_config``.
        """
        ds_flag = "-d " + test_config["json"] if test_config["deepspeed"] else ""
        ckpt_num = test_config.get("ckpt_num_layers", 1)
        other_args = "-o " + test_config[
            "other_args"] if "other_args" in test_config else ""

        cmd = "./ds_gpt2_test.sh -m {0} -g {1} -n {2} -b {3} -s {4} -l {5} -h {6} -q {7} -e {8} -c {9} {10} {11}".format(
            test_config["mp"],
            test_config["gpus"],
            test_config["nodes"],
            test_config["bs"],
            test_config["steps"],
            test_config["layers"],
            test_config["hidden_size"],
            test_config["seq_length"],
            test_config["heads"],
            ckpt_num,
            other_args,
            ds_flag)

        self.ensure_directory_exists(output)
        with open(output, "w") as f:
            print(cmd)
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash',
                           stdout=f,
                           stderr=f)