+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ export NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ export NVTE_FLASH_ATTN=1
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
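The dimensions in GPT_MODEL_ARGS are the Llama-2-7B shape; assuming the Llama-2 tokenizer's 32,000-token vocabulary, they work out to roughly 6.7B parameters: each of the 32 layers carries about 4 x 4096^2 ≈ 67M attention weights plus 3 x 4096 x 11008 ≈ 135M SwiGLU weights (≈ 202M per layer, ≈ 6.48B total), and the untied input and output embeddings add another 2 x 32000 x 4096 ≈ 0.26B.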
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
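PROFILE_ARGS is assembled here but, as the launch commands below show, it is never appended to the pretrain_gpt.py invocation, so the argument dump further down reports profile=False with the default profiler settings (steps 10-12, rank 0, profile_dir ./). A hedged sketch of a launch line that would actually enable the PyTorch profiler, assuming the wrapper expands the same argument arrays defined in this trace and that this fork accepts the flags exactly as written above:

  # identical to the logged command, with PROFILE_ARGS added
  numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} \
    python -u pretrain_gpt.py \
      "${GPT_MODEL_ARGS[@]}" "${TRAINING_ARGS[@]}" "${MODEL_PARALLEL_ARGS[@]}" \
      "${DATA_ARGS[@]}" "${EVAL_AND_LOGGING_ARGS[@]}" \
      "${PROFILE_ARGS[@]}" \
      "${DISTRIBUTED_ARGS[@]}"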
+ RANK=0
+ LOCAL_RANK=0
+ WORLD_SIZE=8
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ numactl --cpunodebind=0 --membind=0 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 0 --world-size 8 --local-rank 0 --dist-url tcp://localhost:34577

Ranks 1 through 7 emit the same environment setup and launch command, differing only in RANK/LOCAL_RANK, the --rank/--local-rank values passed to pretrain_gpt.py, and the NUMA node given to numactl (--cpunodebind and --membind always match the local rank).
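A minimal sketch of a single-node wrapper that would drive the eight launches above (the script name train_one_rank.sh is hypothetical; it is assumed to contain the exports, argument arrays, and numactl line shown in this trace, reading RANK and LOCAL_RANK from its environment):

  #!/bin/bash
  WORLD_SIZE=8
  for r in $(seq 0 $((WORLD_SIZE - 1))); do
    # one pretrain_gpt.py process per GPU, pinned to the matching NUMA node
    RANK=$r LOCAL_RANK=$r bash train_one_rank.sh &
  done
  wait  # keep the wrapper alive until all eight ranks exit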
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
[WARNING | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
using world size: 8, data-parallel size: 4, context-parallel size: 1, hierarchical context-parallel sizes: None, tensor-model-parallel size: 1, encoder-tensor-model-parallel size: 0, pipeline-model-parallel size: 2, encoder-pipeline-model-parallel size: 0
WARNING: overriding default arguments for tokenizer_type:GPT2BPETokenizer with tokenizer_type:Llama2Tokenizer
WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False since non-interleaved schedule does not support overlapping p2p communication and aligned param AG
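As the UserWarning notes, --sequence-parallel is a no-op when the tensor-model-parallel size is 1, so for this PP=2 / DP=4 run the flag can simply be dropped; a hedged simplification of the array defined above:

  # same parallel layout, without the inert flag
  MODEL_PARALLEL_ARGS=(--tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)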
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
------------------------ arguments ------------------------
accumulate_allreduce_grads_in_fp32 .............. True
adam_beta1 ...................................... 0.9
adam_beta2 ...................................... 0.95
adam_eps ........................................ 1e-08
add_bias_linear ................................. False
add_position_embedding .......................... True
add_qkv_bias .................................... False
adlr_autoresume ................................. False
adlr_autoresume_interval ........................ 1000
align_grad_reduce ............................... True
align_param_gather .............................. False
app_tag_run_name ................................ None
app_tag_run_version ............................. 0.0.0
apply_layernorm_1p .............................. False
apply_query_key_layer_scaling ................... False
apply_residual_connection_post_layernorm ........ False
apply_rope_fusion ............................... True
async_save ...................................... None
async_tensor_model_parallel_allreduce ........... True
attention_backend ............................... AttnBackend.auto
attention_dropout ............................... 0.0
attention_softmax_in_fp32 ....................... False
auto_detect_ckpt_format ......................... False
barrier_with_L1_time ............................ True
bert_binary_head ................................ True
bert_embedder_type .............................. megatron
bert_load ....................................... None
bf16 ............................................ True
bias_dropout_fusion ............................. True
bias_gelu_fusion ................................ False
bias_swiglu_fusion .............................. True
biencoder_projection_dim ........................ 0
biencoder_shared_query_context_model ............ False
block_data_path ................................. None
calculate_per_token_loss ........................ False
check_for_nan_in_loss_and_grad .................. True
check_for_spiky_loss ............................ False
check_weight_hash_across_dp_replicas_interval ... None
ckpt_assume_constant_structure .................. False
ckpt_convert_format ............................. None
ckpt_convert_save ............................... None
ckpt_convert_update_legacy_dist_opt_format ...... False
ckpt_format ..................................... torch
ckpt_fully_parallel_load ........................ False
ckpt_fully_parallel_save ........................ True
ckpt_fully_parallel_save_deprecated ............. False
ckpt_step ....................................... None
classes_fraction ................................ 1.0
clip_grad ....................................... 1.0
clone_scatter_output_in_embedding ............... True
config_logger_dir ............................... 
consumed_train_samples .......................... 0
consumed_valid_samples .......................... 0
context_parallel_size ........................... 1
cp_comm_type .................................... ['p2p']
create_attention_mask_in_dataloader ............. True
cross_entropy_loss_fusion ....................... False
data_args_path .................................. None
data_cache_path ................................. None
data_parallel_random_init ....................... False
data_parallel_size .............................. 4
data_path ....................................... ['/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document']
data_per_class_fraction ......................... 1.0
data_sharding ................................... True
dataloader_type ................................. single
ddp_average_in_collective ....................... True
ddp_bucket_size ................................. None
decoder_first_pipeline_num_layers ............... None
decoder_last_pipeline_num_layers ................ None
decoder_num_layers .............................. None
decoder_seq_length .............................. None
decoupled_lr .................................... None
decoupled_min_lr ................................ None
decrease_batch_size_if_needed ................... False
defer_embedding_wgrad_compute ................... False
deprecated_use_mcore_models ..................... False
deterministic_mode .............................. False
dino_bottleneck_size ............................ 256
dino_freeze_last_layer .......................... 1
dino_head_hidden_size ........................... 2048
dino_local_crops_number ......................... 10
dino_local_img_size ............................. 96
dino_norm_last_layer ............................ False
dino_teacher_temp ............................... 0.07
dino_warmup_teacher_temp ........................ 0.04
dino_warmup_teacher_temp_epochs ................. 30
disable_straggler_on_startup .................... False
dist_ckpt_format_deprecated ..................... None
dist_ckpt_strictness ............................ assume_ok_unexpected
dist_url ........................................ tcp://localhost:34577
distribute_saved_activations .................... False
distributed_backend ............................. nccl
distributed_timeout_minutes ..................... 10
embedding_path .................................. None
empty_unused_memory_level ....................... 0
enable_ft_package ............................... False
enable_one_logger ............................... True
encoder_num_layers .............................. 32
encoder_pipeline_model_parallel_size ............ 0
encoder_seq_length .............................. 4096
encoder_tensor_model_parallel_size .............. 0
end_weight_decay ................................ 0.1
eod_mask_loss ................................... False
error_injection_rate ............................ 0
error_injection_type ............................ transient_error
eval_interval ................................... 1000
eval_iters ...................................... 3
evidence_data_path .............................. None
exit_duration_in_mins ........................... None
exit_interval ................................... None
exit_on_missing_checkpoint ...................... False
exit_signal_handler ............................. False
exp_avg_dtype ................................... torch.float32
exp_avg_sq_dtype ................................ torch.float32
expert_model_parallel_size ...................... 1
expert_tensor_parallel_size ..................... 1
ffn_hidden_size ................................. 11008
finetune ........................................ False
flash_decode .................................... False
fp16 ............................................ False
fp16_lm_cross_entropy ........................... False
fp32_residual_connection ........................ False
fp8 ............................................. None
fp8_amax_compute_algo ........................... most_recent
fp8_amax_history_len ............................ 1
fp8_interval .................................... 1
fp8_margin ...................................... 0
fp8_param_gather ................................ False
fp8_wgrad ....................................... True
global_batch_size ............................... 64
gradient_accumulation_fusion .................... True
group_query_attention ........................... False
head_lr_mult .................................... 1.0
hidden_dropout .................................. 0.0
hidden_size ..................................... 4096
hierarchical_context_parallel_sizes ............. None
hybrid_attention_ratio .......................... 0.0
hybrid_mlp_ratio ................................ 0.0
hybrid_override_pattern ......................... None
hysteresis ...................................... 2
ict_head_size ................................... None
ict_load ........................................ None
img_h ........................................... 224
img_w ........................................... 224
indexer_batch_size .............................. 128
indexer_log_interval ............................ 1000
inference_batch_times_seqlen_threshold .......... -1
inference_max_seq_length ........................ 2560
init_method_std ................................. 0.006
init_method_xavier_uniform ...................... False
initial_loss_scale .............................. 4294967296
iter_per_epoch .................................. 1250
kv_channels ..................................... 128
kv_lora_rank .................................... 32
lazy_mpu_init ................................... None
load ............................................ None
local_rank ...................................... 0
log_interval .................................... 1
log_loss_scale_to_tensorboard ................... True
log_memory_to_tensorboard ....................... False
log_num_zeros_in_grad ........................... False
log_params_norm ................................. False
log_progress .................................... False
log_straggler ................................... False
log_throughput .................................. True
log_timers_to_tensorboard ....................... False
log_validation_ppl_to_tensorboard ............... False
log_world_size_to_tensorboard ................... False
logging_level ................................... None
loss_scale ...................................... None
loss_scale_window ............................... 1000
lr .............................................. 3e-05
lr_decay_iters .................................. None
lr_decay_samples ................................ None
lr_decay_style .................................. cosine
lr_warmup_fraction .............................. None
lr_warmup_init .................................. 0.0
lr_warmup_iters ................................. 1
lr_warmup_samples ............................... 0
lr_wsd_decay_iters .............................. None
lr_wsd_decay_samples ............................ None
lr_wsd_decay_style .............................. exponential
main_grads_dtype ................................ torch.float32
main_params_dtype ............................... torch.float32
make_vocab_size_divisible_by .................... 128
manual_gc ....................................... False
manual_gc_eval .................................. True
manual_gc_interval .............................. 0
mask_factor ..................................... 1.0
mask_prob ....................................... 0.15
mask_type ....................................... random
masked_softmax_fusion ........................... True
max_position_embeddings ......................... 4096
max_tokens_to_oom ............................... 12000
memory_snapshot_path ............................ snapshot.pickle
merge_file ...................................... None
micro_batch_size ................................ 1
microbatch_group_size_per_vp_stage .............. None
min_loss_scale .................................. 1.0
min_lr .......................................... 3e-06
mmap_bin_files .................................. True
mock_data ....................................... False
moe_aux_loss_coeff .............................. 0.0
moe_expert_capacity_factor ...................... None
moe_extended_tp ................................. False
moe_ffn_hidden_size ............................. 11008
moe_grouped_gemm ................................ False
moe_input_jitter_eps ............................ None
moe_layer_freq .................................. 1
moe_layer_recompute ............................. False
moe_pad_expert_input_to_capacity ................ False
moe_per_layer_logging ........................... False
moe_router_load_balancing_type .................. aux_loss
moe_router_pre_softmax .......................... False
moe_router_topk ................................. 2
moe_router_topk_limited_devices ................. None
moe_router_topk_scaling_factor .................. None
moe_shared_expert_intermediate_size ............. None
moe_shared_expert_overlap ....................... False
moe_token_dispatcher_type ....................... allgather
moe_token_drop_policy ........................... probs
moe_use_legacy_grouped_gemm ..................... False
moe_use_upcycling ............................... False
moe_z_loss_coeff ................................ None
multi_latent_attention .......................... False
nccl_communicator_config_path ................... None
no_load_optim ................................... None
no_load_rng ..................................... None
no_persist_layer_norm ........................... False
no_save_optim ................................... None
no_save_rng ..................................... None
non_persistent_ckpt_type ........................ None
non_persistent_global_ckpt_dir .................. None
non_persistent_local_ckpt_algo .................. fully_parallel
non_persistent_local_ckpt_dir ................... None
non_persistent_save_interval .................... None
norm_epsilon .................................... 1e-05
normalization ................................... RMSNorm
num_attention_heads ............................. 32
num_channels .................................... 3
num_classes ..................................... 1000
num_dataset_builder_threads ..................... 1
num_distributed_optimizer_instances ............. 1
num_experts ..................................... None
num_layers ...................................... 32
num_layers_per_virtual_pipeline_stage ........... None
num_query_groups ................................ 1
num_workers ..................................... 2
one_logger_async ................................ False
one_logger_project .............................. megatron-lm
one_logger_run_name ............................. None
onnx_safe ....................................... None
openai_gelu ..................................... False
optimizer ....................................... adam
output_bert_embeddings .......................... False
overlap_grad_reduce ............................. True
overlap_p2p_comm ................................ False
overlap_p2p_comm_warmup_flush ................... False
overlap_param_gather ............................ False
overlap_param_gather_with_optimizer_step ........ False
override_opt_param_scheduler .................... False
params_dtype .................................... torch.bfloat16
patch_dim ....................................... 16
per_split_data_args_path ........................ None
perform_initialization .......................... True
pipeline_model_parallel_size .................... 2
pipeline_model_parallel_split_rank .............. None
position_embedding_type ......................... rope
pretrained_checkpoint ........................... None
profile ......................................... False
profile_dir ..................................... ./
profile_ranks ................................... [0]
profile_step_end ................................ 12
profile_step_start .............................. 10
q_lora_rank ..................................... None
qk_head_dim ..................................... 128
qk_layernorm .................................... False
qk_pos_emb_head_dim ............................. 64
query_in_block_prob ............................. 0.1
rampup_batch_size ............................... None
rank ............................................ 0
recompute_granularity ........................... None
recompute_method ................................ None
recompute_num_layers ............................ None
record_memory_history ........................... False
renormalize_blend_weights ....................... False
rerun_mode ...................................... disabled
reset_attention_mask ............................ False
reset_position_ids .............................. False
retriever_report_topk_accuracies ................ []
retriever_score_scaling ......................... False
retriever_seq_length ............................ 256
retro_add_retriever ............................. False
retro_attention_gate ............................ 1
retro_cyclic_train_iters ........................ None
retro_encoder_attention_dropout ................. 0.1
retro_encoder_hidden_dropout .................... 0.1
retro_encoder_layers ............................ 2
retro_num_neighbors ............................. 2
retro_num_retrieved_chunks ...................... 2
retro_project_dir ............................... None
retro_verify_neighbor_count ..................... True
rotary_base ..................................... 10000
rotary_interleaved .............................. False
rotary_percent .................................. 1.0
rotary_scaling_factor ........................... 1.0
rotary_seq_len_interpolation_factor ............. None
s3_cache_path ................................... None
sample_rate ..................................... 1.0
save ............................................ None
save_interval ................................... 1000
scatter_gather_tensors_in_pipeline .............. True
seed ............................................ 1234
seq_length ...................................... 4096
sequence_parallel ............................... False
sgd_momentum .................................... 0.9
short_seq_prob .................................. 0.1
skip_train ...................................... False
skipped_train_samples ........................... 0
spec ............................................ None
split ........................................... 949,50,1
squared_relu .................................... False
standalone_embedding_stage ...................... False
start_weight_decay .............................. 0.1
straggler_ctrlr_port ............................ 65535
straggler_minmax_count .......................... 1
swiglu .......................................... True
swin_backbone_type .............................. tiny
tensor_model_parallel_size ...................... 1
tensorboard_dir ................................. ./tmp_7b
tensorboard_log_interval ........................ 1
tensorboard_queue_size .......................... 1000
test_data_path .................................. None
test_mode ....................................... False
tiktoken_num_special_tokens ..................... 1000
tiktoken_pattern ................................ None
tiktoken_special_tokens ......................... None
timing_log_level ................................ 0
timing_log_option ............................... minmax
titles_data_path ................................ None
tokenizer_model ................................. /models1/Llama-2-7b-chat-hf/tokenizer.model
tokenizer_type .................................. Llama2Tokenizer
tp_comm_bootstrap_backend ....................... nccl
tp_comm_bulk_dgrad .............................. True
tp_comm_bulk_wgrad .............................. True
tp_comm_overlap ................................. False
tp_comm_overlap_ag .............................. True
tp_comm_overlap_cfg ............................. None
tp_comm_overlap_rs .............................. True
tp_comm_overlap_rs_dgrad ........................ False
tp_comm_split_ag ................................ True
tp_comm_split_rs ................................ True
train_data_path ................................. None
train_iters ..................................... 50
train_samples ................................... None
train_sync_interval ............................. None
transformer_impl ................................ local
transformer_pipeline_model_parallel_size ........ 2
untie_embeddings_and_output_weights ............. True
use_checkpoint_args ............................. False
use_checkpoint_opt_param_scheduler .............. False
use_cpu_initialization .......................... None
use_dist_ckpt ................................... False
use_dist_ckpt_deprecated ........................ False
use_distributed_optimizer ....................... True
use_flash_attn .................................. True
use_flash_attn_cutlass .......................... True
use_flash_attn_torch ............................ False
use_flash_attn_triton ........................... False
use_hip_profiler ................................ False
use_legacy_models ............................... 
True use_mp_args_from_checkpoint_args ................ False use_one_sent_docs ............................... False use_precision_aware_optimizer ................... False use_pytorch_profiler ............................ False use_ring_exchange_p2p ........................... False use_rope_scaling ................................ False use_rotary_position_embeddings .................. False use_tokenizer_model_from_checkpoint_args ........ True use_torch_fsdp2 ................................. False use_tp_pp_dp_mapping ............................ False v_head_dim ...................................... 128 valid_data_path ................................. None variable_seq_lengths ............................ False virtual_pipeline_model_parallel_size ............ None vision_backbone_type ............................ vit vision_pretraining .............................. False vision_pretraining_type ......................... classify vocab_extra_ids ................................. 0 vocab_file ...................................... None vocab_size ...................................... None wandb_exp_name .................................. wandb_project .................................. wandb_save_dir .................................. weight_decay .................................... 0.1 weight_decay_incr_style ......................... constant wgrad_deferral_limit ............................ 0 world_size ...................................... 8 yaml_cfg ........................................ None -------------------- end of arguments ---------------------
> building Llama2Tokenizer tokenizer ...
[WARNING | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
> padded vocab (size: 32000) with 0 dummy tokens (new size: 32000)
[WARNING | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
> initializing torch distributed ...
2025-10-30 15:48:22.868274: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/tensorboard/compat/__init__.py", line 42, in tf
    from tensorboard.compat import notf  # noqa: F401
ImportError: cannot import name 'notf' from 'tensorboard.compat' (/usr/local/lib/python3.10/dist-packages/tensorboard/compat/__init__.py)
During handling of the above exception, another exception occurred:
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
> setting tensorboard ...
WARNING: one_logger package is required to enable e2e metrics tracking. please go to https://confluence.nvidia.com/display/MLWFO/Package+Repositories for details to install it
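The tensorboard import failure above (ending in the 'MessageFactory' has no attribute 'GetPrototype' error) typically points at a protobuf wheel newer than the one the installed tensorboard release expects; the run proceeds regardless, as the training iterations below show, and only TensorBoard logging is affected. A small diagnostic sketch (not part of the run, just a way to see which versions are installed before pinning a compatible pair):

```python
# Hypothetical diagnostic for the tensorboard import failure above:
# print the installed tensorboard and protobuf versions.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("tensorboard", "protobuf"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```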
[WARNING | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
> initialized tensor model parallel with size 1
> initialized pipeline model parallel with size 2
> setting random seeds to 1234 ...
> compiling dataset index builder ...
make: Entering directory '/megatron-lm/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory '/megatron-lm/megatron/core/datasets'
>>> done with dataset index builder. Compilation time: 0.032 seconds
> compiling and loading fused kernels ...
>>> done with compiling and loading fused kernels. Compilation time: 1.511 seconds
time to initialize megatron (seconds): 5.644
[after megatron is initialized] datetime: 2025-10-30 15:48:27
building GPT model ...
> number of parameters on (tensor, pipeline) model parallel rank (0, 1): 3369209856
GPTModel(
  (language_model): TransformerLanguageModel(
    (embedding): Embedding(
      (word_embeddings): VocabParallelEmbedding()
      (embedding_dropout): Dropout(p=0.0, inplace=False)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): ParallelTransformer(
      (layers): ModuleList(
        (0-15): 16 x ParallelTransformerLayer(
          (input_norm): RMSNorm()
          (self_attention): ParallelAttention(
            (query_key_value): ColumnParallelLinear()
            (core_attention): CoreAttention(
              (scale_mask_softmax): FusedScaleMaskSoftmax()
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (core_attention_flash): FlashSelfAttention()
            (dense): RowParallelLinear()
          )
          (post_attention_norm): RMSNorm()
          (mlp): ParallelMLP(
            (dense_h_to_4h): ColumnParallelLinear()
            (dense_4h_to_h): RowParallelLinear()
          )
        )
      )
    )
  )
)
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 3369205760
[after model, optimizer, and learning rate scheduler are built] datetime: 2025-10-30 15:48:27
> building train, validation, and test datasets ...
> datasets target sizes (minimum size): train: 3200 validation: 192 test: 192
> building train, validation, and test datasets for GPT ...
> finished creating GPT datasets ...
[after dataloaders are built] datetime: 2025-10-30 15:48:27
done with setup ...
training ...
(min, max) time across ranks (ms):
    model-and-optimizer-setup ......................: (199.88, 204.83)
    train/valid/test-data-iterators-setup ..........: (360.81, 415.69)
[before the start of training step] datetime: 2025-10-30 15:48:27
Number of parameters in transformer layers in billions: 6.48
Number of parameters in embedding layers in billions: 0.26
Total number of parameters in billions: 6.74
Number of parameters in most loaded shard in billions: 3.3693
Number of parameters in other shards in billions: 3.2383
Theoretical memory footprints: weight and optimizer=28919.29 MB
[2025-10-30 15:49:58] iteration 1/ 50 | consumed samples: 64 | elapsed time per iteration (ms): 90286.8 | throughput per GPU (TFLOP/s/GPU): 16.7 | learning rate: 3.000000E-05 | global batch size: 64 | lm loss: 1.045589E+01 | loss scale: 1.0 | grad norm: 270.234 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 |
[Rank 4] (after 1 iterations) memory (MB) | allocated: 28987.10498046875 | max allocated: 32813.6396484375 | reserved: 36168.0 | max reserved: 36168.0
[Rank 0] (after 1 iterations) memory (MB) | allocated: 28920.36181640625 | max allocated: 39419.125 | reserved: 40814.0 | max reserved: 40814.0
Could not open /var/log/hylog/.
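The per-stage parameter counts and the throughput figures above can be sanity-checked directly from the model arguments. The sketch below is my own back-of-the-envelope arithmetic, not Megatron-LM's exact accounting; it assumes untied input/output embeddings, a fused QKV projection, SwiGLU gate+up projections, two RMSNorm weight vectors per layer, a final RMSNorm on the last pipeline stage, and a 16/16 layer split across the two pipeline stages. It reproduces the logged 3,369,205,760 / 3,369,209,856 per-stage counts exactly and lands within a few percent of the logged TFLOP/s values (16.7 at iteration 1, roughly 38 in the steady-state iterations that follow).

```python
# Back-of-the-envelope check of the logged parameter counts and throughput
# (hidden 4096, ffn 11008, 32 layers, seq 4096, vocab 32000, global batch 64,
#  PP=2 with 16 layers per stage, untied embeddings). Assumptions as in the
#  lead-in above; this is an estimate, not Megatron-LM's own formula.
h, ffn, layers, vocab = 4096, 11008, 32, 32000
seq, batch, gpus = 4096, 64, 8

embed = vocab * h                        # input word embeddings
per_layer = (
    3 * h * h                            # fused QKV projection
    + h * h                              # attention output projection
    + 2 * h * ffn                        # SwiGLU gate + up projections
    + ffn * h                            # MLP down projection
    + 2 * h                              # two RMSNorm weight vectors
)
stage0 = embed + 16 * per_layer                   # pipeline rank 0
stage1 = 16 * per_layer + h + vocab * h           # pipeline rank 1 (+ final RMSNorm, untied output)
total = stage0 + stage1
print(stage0, stage1, round(total / 1e9, 2))      # 3369205760 3369209856 6.74

# Rough FLOPs per iteration: 6 * params * tokens for the dense matmuls,
# plus the quadratic attention-score term (forward + backward ~ 3x forward).
tokens = batch * seq
flops = 6 * total * tokens + 12 * batch * seq * seq * h * layers
for iter_ms in (90286.8, 39700.0):                # iteration 1 vs. a typical later iteration
    print(round(flops / (gpus * iter_ms / 1e3) / 1e12, 1))  # ~17.0 and ~38.7 TFLOP/s/GPU
```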
[2025-10-30 15:50:38] iteration 2/ 50 | consumed samples: 128 | elapsed time per iteration (ms): 40295.7 | throughput per GPU (TFLOP/s/GPU): 37.5 | learning rate: 2.997226E-05 | global batch size: 64 | lm loss: 1.046932E+01 | loss scale: 1.0 | grad norm: 242.090 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:51:18] iteration 3/ 50 | consumed samples: 192 | elapsed time per iteration (ms): 39819.8 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.988916E-05 | global batch size: 64 | lm loss: 8.424404E+00 | loss scale: 1.0 | grad norm: 502.275 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:51:57] iteration 4/ 50 | consumed samples: 256 | elapsed time per iteration (ms): 39600.0 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.975105E-05 | global batch size: 64 | lm loss: 1.290415E+01 | loss scale: 1.0 | grad norm: 174.010 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:52:37] iteration 5/ 50 | consumed samples: 320 | elapsed time per iteration (ms): 39826.9 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.955848E-05 | global batch size: 64 | lm loss: 9.753544E+00 | loss scale: 1.0 | grad norm: 47.956 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:53:17] iteration 6/ 50 | consumed samples: 384 | elapsed time per iteration (ms): 39826.8 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.931225E-05 | global batch size: 64 | lm loss: 9.120786E+00 | loss scale: 1.0 | grad norm: 165.685 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:53:57] iteration 7/ 50 | consumed samples: 448 | elapsed time per iteration (ms): 39536.7 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 2.901338E-05 | global batch size: 64 | lm loss: 8.215652E+00 | loss scale: 1.0 | grad norm: 40.534 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:54:37] iteration 8/ 50 | consumed samples: 512 | elapsed time per iteration (ms): 40029.6 | throughput per GPU (TFLOP/s/GPU): 37.7 | learning rate: 2.866308E-05 | global batch size: 64 | lm loss: 7.065186E+00 | loss scale: 1.0 | grad norm: 10.479 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:55:16] iteration 9/ 50 | consumed samples: 576 | elapsed time per iteration (ms): 39653.0 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.826280E-05 | global batch size: 64 | lm loss: 7.098128E+00 | loss scale: 1.0 | grad norm: 8.814 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:55:56] iteration 10/ 50 | consumed samples: 640 | elapsed time per iteration (ms): 39674.7 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.781419E-05 | global batch size: 64 | lm loss: 6.366463E+00 | loss scale: 1.0 | grad norm: 6.373 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:56:36] iteration 11/ 50 | consumed samples: 704 | elapsed time per iteration (ms): 40139.3 | throughput per GPU (TFLOP/s/GPU): 37.6 | learning rate: 2.731908E-05 | global batch size: 64 | lm loss: 6.430417E+00 | loss scale: 1.0 | grad norm: 6.818 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:57:16] iteration 12/ 50 | consumed samples: 768 | elapsed time per iteration (ms): 39490.8 | throughput 
per GPU (TFLOP/s/GPU): 38.2 | learning rate: 2.677952E-05 | global batch size: 64 | lm loss: 6.327631E+00 | loss scale: 1.0 | grad norm: 3.020 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:57:55] iteration 13/ 50 | consumed samples: 832 | elapsed time per iteration (ms): 39506.9 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 2.619772E-05 | global batch size: 64 | lm loss: 6.092177E+00 | loss scale: 1.0 | grad norm: 2.616 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:58:35] iteration 14/ 50 | consumed samples: 896 | elapsed time per iteration (ms): 39718.0 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.557606E-05 | global batch size: 64 | lm loss: 6.129852E+00 | loss scale: 1.0 | grad norm: 4.508 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:59:14] iteration 15/ 50 | consumed samples: 960 | elapsed time per iteration (ms): 39658.4 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.491711E-05 | global batch size: 64 | lm loss: 6.379290E+00 | loss scale: 1.0 | grad norm: 15.828 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 15:59:54] iteration 16/ 50 | consumed samples: 1024 | elapsed time per iteration (ms): 39884.8 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.422357E-05 | global batch size: 64 | lm loss: 6.207567E+00 | loss scale: 1.0 | grad norm: 3.418 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:00:34] iteration 17/ 50 | consumed samples: 1088 | elapsed time per iteration (ms): 39599.3 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.349830E-05 | global batch size: 64 | lm loss: 6.430919E+00 | loss scale: 1.0 | grad norm: 18.031 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:01:14] iteration 18/ 50 | consumed samples: 1152 | elapsed time per iteration (ms): 39926.3 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 2.274427E-05 | global batch size: 64 | lm loss: 6.162337E+00 | loss scale: 1.0 | grad norm: 14.185 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:01:54] iteration 19/ 50 | consumed samples: 1216 | elapsed time per iteration (ms): 39747.6 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.196458E-05 | global batch size: 64 | lm loss: 5.844732E+00 | loss scale: 1.0 | grad norm: 3.080 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:02:34] iteration 20/ 50 | consumed samples: 1280 | elapsed time per iteration (ms): 39922.5 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 2.116243E-05 | global batch size: 64 | lm loss: 5.706470E+00 | loss scale: 1.0 | grad norm: 7.213 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:03:13] iteration 21/ 50 | consumed samples: 1344 | elapsed time per iteration (ms): 39735.6 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.034112E-05 | global batch size: 64 | lm loss: 5.828917E+00 | loss scale: 1.0 | grad norm: 5.298 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:03:53] iteration 22/ 50 | consumed samples: 1408 | elapsed time per iteration (ms): 39615.3 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 1.950403E-05 | global batch size: 64 | lm loss: 5.981213E+00 | loss 
scale: 1.0 | grad norm: 3.407 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:04:38] iteration 23/ 50 | consumed samples: 1472 | elapsed time per iteration (ms): 44924.4 | throughput per GPU (TFLOP/s/GPU): 33.6 | learning rate: 1.865460E-05 | global batch size: 64 | lm loss: 5.460212E+00 | loss scale: 1.0 | grad norm: 4.123 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:05:44] iteration 24/ 50 | consumed samples: 1536 | elapsed time per iteration (ms): 65925.0 | throughput per GPU (TFLOP/s/GPU): 22.9 | learning rate: 1.779631E-05 | global batch size: 64 | lm loss: 5.226260E+00 | loss scale: 1.0 | grad norm: 5.918 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:06:24] iteration 25/ 50 | consumed samples: 1600 | elapsed time per iteration (ms): 40182.0 | throughput per GPU (TFLOP/s/GPU): 37.6 | learning rate: 1.693270E-05 | global batch size: 64 | lm loss: 5.305700E+00 | loss scale: 1.0 | grad norm: 1.561 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:07:04] iteration 26/ 50 | consumed samples: 1664 | elapsed time per iteration (ms): 39762.6 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 1.606730E-05 | global batch size: 64 | lm loss: 5.153278E+00 | loss scale: 1.0 | grad norm: 2.196 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:07:44] iteration 27/ 50 | consumed samples: 1728 | elapsed time per iteration (ms): 39921.2 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 1.520369E-05 | global batch size: 64 | lm loss: 5.105300E+00 | loss scale: 1.0 | grad norm: 1.382 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:08:23] iteration 28/ 50 | consumed samples: 1792 | elapsed time per iteration (ms): 39815.2 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 1.434540E-05 | global batch size: 64 | lm loss: 4.925309E+00 | loss scale: 1.0 | grad norm: 1.777 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:09:03] iteration 29/ 50 | consumed samples: 1856 | elapsed time per iteration (ms): 39752.5 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 1.349597E-05 | global batch size: 64 | lm loss: 5.181439E+00 | loss scale: 1.0 | grad norm: 1.845 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:09:43] iteration 30/ 50 | consumed samples: 1920 | elapsed time per iteration (ms): 39643.3 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 1.265888E-05 | global batch size: 64 | lm loss: 5.208538E+00 | loss scale: 1.0 | grad norm: 2.059 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:10:23] iteration 31/ 50 | consumed samples: 1984 | elapsed time per iteration (ms): 39988.1 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 1.183757E-05 | global batch size: 64 | lm loss: 4.890507E+00 | loss scale: 1.0 | grad norm: 1.327 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:11:03] iteration 32/ 50 | consumed samples: 2048 | elapsed time per iteration (ms): 40034.3 | throughput per GPU (TFLOP/s/GPU): 37.7 | learning rate: 1.103542E-05 | global batch size: 64 | lm loss: 4.993505E+00 | loss scale: 1.0 | grad norm: 1.328 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | 
[2025-10-30 16:11:43] iteration 33/ 50 | consumed samples: 2112 | elapsed time per iteration (ms): 39777.8 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 1.025573E-05 | global batch size: 64 | lm loss: 5.216469E+00 | loss scale: 1.0 | grad norm: 1.112 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:12:22] iteration 34/ 50 | consumed samples: 2176 | elapsed time per iteration (ms): 39744.7 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 9.501700E-06 | global batch size: 64 | lm loss: 5.064697E+00 | loss scale: 1.0 | grad norm: 1.134 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:13:02] iteration 35/ 50 | consumed samples: 2240 | elapsed time per iteration (ms): 39795.0 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 8.776425E-06 | global batch size: 64 | lm loss: 4.957899E+00 | loss scale: 1.0 | grad norm: 0.992 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:13:42] iteration 36/ 50 | consumed samples: 2304 | elapsed time per iteration (ms): 39734.5 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 8.082888E-06 | global batch size: 64 | lm loss: 5.244042E+00 | loss scale: 1.0 | grad norm: 1.144 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:14:22] iteration 37/ 50 | consumed samples: 2368 | elapsed time per iteration (ms): 39786.1 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 7.423938E-06 | global batch size: 64 | lm loss: 5.032987E+00 | loss scale: 1.0 | grad norm: 0.960 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:15:01] iteration 38/ 50 | consumed samples: 2432 | elapsed time per iteration (ms): 39439.9 | throughput per GPU (TFLOP/s/GPU): 38.3 | learning rate: 6.802284E-06 | global batch size: 64 | lm loss: 4.990102E+00 | loss scale: 1.0 | grad norm: 0.900 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:15:41] iteration 39/ 50 | consumed samples: 2496 | elapsed time per iteration (ms): 39764.0 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 6.220479E-06 | global batch size: 64 | lm loss: 4.856393E+00 | loss scale: 1.0 | grad norm: 1.125 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:16:20] iteration 40/ 50 | consumed samples: 2560 | elapsed time per iteration (ms): 39466.0 | throughput per GPU (TFLOP/s/GPU): 38.3 | learning rate: 5.680916E-06 | global batch size: 64 | lm loss: 5.073430E+00 | loss scale: 1.0 | grad norm: 1.028 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:17:00] iteration 41/ 50 | consumed samples: 2624 | elapsed time per iteration (ms): 39639.2 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 5.185811E-06 | global batch size: 64 | lm loss: 5.006877E+00 | loss scale: 1.0 | grad norm: 0.856 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:17:40] iteration 42/ 50 | consumed samples: 2688 | elapsed time per iteration (ms): 39736.3 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 4.737197E-06 | global batch size: 64 | lm loss: 4.772885E+00 | loss scale: 1.0 | grad norm: 0.983 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:18:19] iteration 43/ 50 | consumed samples: 2752 | elapsed time per iteration (ms): 39481.7 | 
throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 4.336920E-06 | global batch size: 64 | lm loss: 4.907492E+00 | loss scale: 1.0 | grad norm: 0.823 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:18:59] iteration 44/ 50 | consumed samples: 2816 | elapsed time per iteration (ms): 39949.9 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 3.986624E-06 | global batch size: 64 | lm loss: 4.758832E+00 | loss scale: 1.0 | grad norm: 1.009 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:19:39] iteration 45/ 50 | consumed samples: 2880 | elapsed time per iteration (ms): 39817.1 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 3.687747E-06 | global batch size: 64 | lm loss: 4.631381E+00 | loss scale: 1.0 | grad norm: 0.807 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:20:19] iteration 46/ 50 | consumed samples: 2944 | elapsed time per iteration (ms): 39566.5 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 3.441519E-06 | global batch size: 64 | lm loss: 4.772638E+00 | loss scale: 1.0 | grad norm: 1.379 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:20:58] iteration 47/ 50 | consumed samples: 3008 | elapsed time per iteration (ms): 39582.4 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 3.248951E-06 | global batch size: 64 | lm loss: 4.898998E+00 | loss scale: 1.0 | grad norm: 0.859 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:21:38] iteration 48/ 50 | consumed samples: 3072 | elapsed time per iteration (ms): 39651.1 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 3.110835E-06 | global batch size: 64 | lm loss: 5.095502E+00 | loss scale: 1.0 | grad norm: 0.887 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:22:17] iteration 49/ 50 | consumed samples: 3136 | elapsed time per iteration (ms): 39759.3 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 3.027737E-06 | global batch size: 64 | lm loss: 4.849247E+00 | loss scale: 1.0 | grad norm: 0.706 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [2025-10-30 16:22:57] iteration 50/ 50 | consumed samples: 3200 | elapsed time per iteration (ms): 39629.9 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 3.000000E-06 | global batch size: 64 | lm loss: 5.294223E+00 | loss scale: 1.0 | grad norm: 0.789 | num zeros: 0 | number of skipped iterations: 0 | number of nan iterations: 0 | [after training is done] datetime: 2025-10-30 16:22:57 [WARNING | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled Evaluating on 192 samples Evaluating iter 1/3 Evaluating iter 2/3 Evaluating iter 3/3 (min, max) time across ranks (ms): evaluate .......................................: (16322.27, 16446.91) ---------------------------------------------------------------------------------------------------------------- [WARNING | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled [WARNING | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled validation loss at iteration 50 on validation set | lm loss value: 4.873671E+00 | lm loss PPL: 1.308002E+02 | ---------------------------------------------------------------------------------------------------------------- [WARNING | megatron.core.rerun_state_machine]: Setting 
RerunStateMachine mode disabled
Evaluating on 192 samples
Evaluating iter 1/3
Evaluating iter 2/3
Evaluating iter 3/3
(min, max) time across ranks (ms):
    evaluate .......................................: (5993.56, 6118.05)
----------------------------------------------------------------------------------------------------------
 validation loss at iteration 50 on test set | lm loss value: 5.035580E+00 | lm loss PPL: 1.537888E+02 |
----------------------------------------------------------------------------------------------------------
[WARNING | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:21.049338 936590 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
[identical ProcessGroupNCCL warnings from the remaining ranks omitted]
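The ProcessGroupNCCL warning above is harmless for this run, but it can be avoided by tearing down the default process group explicitly before the script exits, as the warning itself suggests. A minimal sketch of what that looks like at the end of a training script (hypothetical placement, not taken from the run above):

```python
import torch.distributed as dist

# At the very end of the training script, once every rank has finished its
# collectives, tear down the default process group explicitly so the
# ProcessGroupNCCL destructor warning is not emitted on exit.
if dist.is_available() and dist.is_initialized():
    dist.barrier()                 # optional: keep ranks from exiting early
    dist.destroy_process_group()
```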
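Stepping back to the evaluation results above: the logged perplexities are simply exp of the corresponding lm loss values, which is easy to confirm:

```python
import math

# Perplexity is exp(lm loss); the printed losses reproduce the logged PPL values.
print(math.exp(4.873671))  # ~130.80 -> validation "lm loss PPL: 1.308002E+02"
print(math.exp(5.035580))  # ~153.79 -> test       "lm loss PPL: 1.537888E+02"
```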