Unverified Commit f525d1f5 authored by ver217, committed by GitHub

[example] update gpt gemini example ci test (#2477)

parent fef5c949
@@ -65,6 +65,7 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
+    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
     args = parser.parse_args()
     return args
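
The hunk above wires a new `--steps` flag into the example so the CI job can run only a handful of iterations. A minimal, stand-alone sketch of the pattern (the surrounding arguments of the real parser are omitted):

```python
import argparse


def parse_args():
    # Stand-in for the example's parser: only the newly added flag is shown.
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    print(f"training for {args.steps} steps")  # e.g. --steps 4 in the CI script below
```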
@@ -236,7 +237,7 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
-    NUM_STEPS = 10
+    NUM_STEPS = args.steps
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
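
The second assertion exists because the benchmark reports the median of the timed (non-warmup) steps, and with an odd count the median is an actual measurement rather than the average of two. With the CI setting `--steps 4` and `WARMUP_STEPS = 1`, three steps are timed and the middle one is reported. A small sketch of that bookkeeping, with hypothetical per-step durations:

```python
from statistics import median

NUM_STEPS = 4        # value passed via --steps in the CI script
WARMUP_STEPS = 1
assert WARMUP_STEPS < NUM_STEPS
assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1   # odd count -> median is a real sample

step_times = [0.93, 0.41, 0.44, 0.39]        # hypothetical per-step durations in seconds
timed = step_times[WARMUP_STEPS:]            # drop the warmup step(s)
print(f"median step time: {median(timed):.2f}s")   # 0.41s, the middle measurement
```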
@@ -290,14 +291,12 @@ def main():
         from torch.distributed.optim import ZeroRedundancyOptimizer
         optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
-        pg = ProcessGroup()
         model = model.half()
         partition_flag = (args.distplan == "zero2")
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
         optimizer = LowLevelZeroOptimizer(
             optimizer,
-            pg=pg,
             reduce_bucket_size=12 * 1024 * 1024,
             overlap_communication=True,
             partition_grad=partition_flag,
...
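
This hunk removes the explicit ProcessGroup: LowLevelZeroOptimizer is now built without a `pg=` argument and falls back to the default process group. A hedged sketch of the resulting zero1/zero2 branch, assuming the import path `from colossalai.zero import LowLevelZeroOptimizer` (the example's actual import may differ) and a model/args prepared as in the demo:

```python
import torch

# Assumed import path; the example may import LowLevelZeroOptimizer from elsewhere in colossalai.
from colossalai.zero import LowLevelZeroOptimizer


def build_zero_optimizer(model: torch.nn.Module, distplan: str):
    """Sketch of the example's zero1/zero2 branch after the pg argument was dropped."""
    assert distplan.startswith("zero")
    model = model.half()                      # fp16 model, as in the example's zero branch
    partition_flag = (distplan == "zero2")    # zero2 additionally partitions gradients
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # No pg= keyword any more: the wrapper uses the default (global) process group.
    optimizer = LowLevelZeroOptimizer(
        optimizer,
        reduce_bucket_size=12 * 1024 * 1024,
        overlap_communication=True,
        partition_grad=partition_flag,
    )
    return model, optimizer
```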
 pip install -r requirements.txt
-# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN="colossalai"
-
-# The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
-export GPUNUM=4
-export PLACEMENT='cpu'
-export USE_SHARD_INIT=False
-export BATCH_SIZE=8
-export MODEL_TYPE="gpt2_medium"
-
-mkdir -p logs
-torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
+# test colossalai
+for TP in 1 2; do
+for PLACEMENT in "cpu" "cuda" "auto" "const"; do
+for SHARD in "True" "False"; do
+colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
+done
+done
+done
+
+# test zero1&2
+for DIST in "zero1" "zero2"; do
+colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
+done
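
The rewritten script replaces the old single-configuration benchmark with a small CI matrix: every combination of tensor-parallel degree (1, 2), placement policy (cpu, cuda, auto, const) and shard-init flag is run for 4 steps, followed by the two low-level ZeRO plans, 18 short runs in total. For illustration only, a Python sketch that enumerates the same matrix and launches each configuration (the real CI uses the bash loops above):

```python
import itertools
import subprocess

# 2 TP degrees x 4 placement policies x 2 shard-init flags = 16 gemini configurations.
gemini_cmds = [
    ["colossalai", "run", "--nproc_per_node=4", "./gemini/train_gpt_demo.py",
     "--steps", "4", "--distplan", "colossalai",
     "--tp_degree", str(tp), "--placement", placement, "--shardinit", shard]
    for tp, placement, shard in itertools.product(
        (1, 2), ("cpu", "cuda", "auto", "const"), ("True", "False"))
]

# Plus the two low-level ZeRO plans.
zero_cmds = [
    ["colossalai", "run", "--nproc_per_node=4", "./gemini/train_gpt_demo.py",
     "--steps", "4", "--distplan", dist]
    for dist in ("zero1", "zero2")
]

for cmd in gemini_cmds + zero_cmds:
    subprocess.run(cmd, check=True)   # abort on the first failing run, like `|| exit 1`
```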