# gpt_scaling_test.py
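"""Scaling test for the minimal GPT benchmark in apex.

Sweeps the number of transformer layers across several
(data, tensor, pipeline)-parallel settings, with and without CPU
offloading, records seconds per training iteration, and plots runtime
against model size in billions of parameters. Assumes a single node
with 8 GPUs and `run_gpt_minimal_test.py` in the working directory.
"""
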
import glob
import os
import shutil
import subprocess
import sys

from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE


def run_gpt(cmd):
    """Run one training command and parse its stdout for timing and size."""
    args = cmd.split(" ")
    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    outs, errs = p.communicate()
    success = False
    runtime = 0.0
    num_params = 0
    for out in outs.decode("utf-8").splitlines():
        if "Average Iteration Time:" in out:
            value = out[out.find(":") + 2 :]
            try:
                runtime = float(value)
            except ValueError:
                print(value)
                sys.exit(1)
        if "Number of Parameters:" in out:
            value = out[out.find(":") + 2 :]
            try:
                num_params = int(value)
            except ValueError:
                print(value)
                sys.exit(1)
        if out == str(TEST_SUCCESS_MESSAGE):
            success = True
    # Convert the raw parameter count to billions, rounded to 3 decimals.
    return runtime, round(num_params / 10.0 ** 9, 3), success, errs
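# Sketch of the stdout lines run_gpt() looks for (values illustrative):
#   Average Iteration Time: 1.234
#   Number of Parameters: 1000000000
# plus TEST_SUCCESS_MESSAGE on its own line when the run completed cleanly.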


def plot(runtimes):
    # Imported lazily so the sweep itself does not require matplotlib.
    import matplotlib.pyplot as plt

    for distributed_setting, results in runtimes.items():
        plt.scatter(results.keys(), results.values(), label=distributed_setting)
    plt.legend()
    plt.xlabel("Parameters (Billions)")
    plt.ylabel("Training Iteration Time (s)")
    plt.title("GPT Scaling w/ Offloading")
    plt.savefig("offload_gpt_scaling.png")
    plt.close()
    # Copy every generated figure into the shared workspace directory.
    os.makedirs("/my_workspace/", exist_ok=True)
    for png in glob.glob("*.png"):
        shutil.copy(png, "/my_workspace/")
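# plot() expects runtimes keyed by setting label, then by model size in
# billions of parameters mapping to seconds per iteration, e.g.
# (illustrative values):
#   {"ddp=8, tensor_parr=1, pipe_parr=1, offload=False": {0.42: 1.3}}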


def main():
    runtimes = {}
    # Layer counts to sweep: finer steps at the low end, coarser above.
    nlist = (
        list(range(2000, 10000, 2000))
        + list(range(10000, 50000, 5000))
        + list(range(50000, 100000, 10000))
    )
    print("N-List:", nlist)
    # (data, tensor, pipeline)-parallel degrees; each product is 8, matching
    # --nproc_per_node below. data_parr is used only in the setting label.
    for data_parr, tens_parr, pipe_parr in [(8, 1, 1), (4, 2, 1), (2, 1, 4), (1, 2, 4)]:
        for offload in [True, False]:
            dist_setting = (
                f"ddp={data_parr}, tensor_parr={tens_parr}, "
                f"pipe_parr={pipe_parr}, offload={offload}"
            )
            runtimes[dist_setting] = {}
            print("Beginning Testing for", dist_setting)
            for n in nlist:
                cmd = (
                    "python3 -m torch.distributed.launch --nproc_per_node=8"
                    " run_gpt_minimal_test.py"
                    f" --micro-batch-size 1 --num-layers {n}"
                    " --hidden-size 128 --num-attention-heads 16"
                    " --max-position-embeddings 128 --seq-length 128"
                    f" --tensor-model-parallel-size {tens_parr}"
                    f" --pipeline-model-parallel-size {pipe_parr}"
                )
                if offload:
                    cmd += " --cpu-offload"
                print(cmd)
                runtime, bill_params, success, errs = run_gpt(cmd)
                if success:
                    runtimes[dist_setting][bill_params] = runtime
                    print(
                        f"{runtime}s per training iter for",
                        f"{bill_params}B parameter GPT-2",
                    )
                    # Refresh the plot as results come in for larger models.
                    if n >= 10000:
                        plot(runtimes)
                else:
                    # Skip the remaining (larger) sizes for this setting.
                    print("GPT-2 w/", n, "layers failed using", dist_setting)
                    print("Moving on to the next distributed setting...")
                    print("#" * 25)
                    print()
                    plot(runtimes)
                    break
    print(runtimes)
    plot(runtimes)


if __name__ == "__main__":
    main()