  PID TTY      STAT   TIME COMMAND
79819 pts/4    Ss     0:00 -bash
80848 pts/4    S+     0:00  \_ bash run.sh
80849 pts/4    Sl+    0:00      \_ /opt/mpi/bin/mpirun --allow-run-as-root -np 64 --hostfile hostfile --tag-output --merge-stderr-to-stdout --output-filename tmp -mca plm_rsh_args -p 3344 -x NCCL_SOCKET_IFNAME=ens1f0 ./llama3_70b.sh node11
80854 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node12 PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ; /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 1 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
80855 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node13 ... /opt/mpi/bin/orted ... -mca ess_base_vpid 2 ... (otherwise identical to PID 80854)
80856 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node14 ... /opt/mpi/bin/orted ... -mca ess_base_vpid 3 ... (otherwise identical to PID 80854)
80857 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node15 ... /opt/mpi/bin/orted ... -mca ess_base_vpid 4 ... (otherwise identical to PID 80854)
80858 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node16 ... /opt/mpi/bin/orted ... -mca ess_base_vpid 5 ... (otherwise identical to PID 80854)
80859 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node17 ... /opt/mpi/bin/orted ... -mca ess_base_vpid 6 ... (otherwise identical to PID 80854)
80860 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node18 ... /opt/mpi/bin/orted ... -mca ess_base_vpid 7 ... (otherwise identical to PID 80854)
80861 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
80883 pts/4    SLl   10:40          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
81108 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81124 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81136 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81148 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81161 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81173 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81182 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81196 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81206 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81218 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81229 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81242 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81253 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81263 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81273 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81287 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81294 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81303 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81310 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81318 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81326 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81334 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81342 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81348 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81357 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81365 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81372 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81380 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81388 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81396 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81400 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81406 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81831 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
81894 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
82022 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
82023 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
82027 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
82028 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883)
80862 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
80888 pts/4    SLl   10:39          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80883, except --rank 1)
81261 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80888)
81281 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same arguments as PID 80888)
81292 pts/4    Sl     0:00          |   \_ python3 -u pretrain_gpt.py ... (same visible arguments as PID 80888; the capture is cut off at this point)
--untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81300 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81308 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81316 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 
--max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81324 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81332 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel 
--tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81341 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81349 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81358 
pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81366 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81375 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 
--no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81383 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81393 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 
1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81399 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81405 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81413 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer 
--ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81416 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81418 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 
--eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81420 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81422 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81424 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 
0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81426 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81428 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization 
RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81430 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81432 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81434 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput 
--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81436 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81438 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path 
/mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81440 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 81442 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566 80865 pts/4 S 0:00 \_ /bin/bash ./llama3_70b.sh node11 80900 pts/4 SLl 10:37 | \_ 
python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81102 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81117 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 
--no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81129 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81140 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 
1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81153 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81166 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81178 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer 
--ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81190 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81200 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 
--eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81215 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81228 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81240 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 
0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81251 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81264 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization 
RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81276 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81288 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81296 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput 
--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81304 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81312 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path 
/mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81319 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81327 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81335 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 
16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81343 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81350 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine 
--min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81359 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81368 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 
--world_size 64 --dist_url tcp://node11:34566 81376 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81384 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81392 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear 
--recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81398 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566 81404 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load 
/mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
81411 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
80868 pts/4 S 0:00 \_ /bin/bash ./llama3_70b.sh node11
80898 pts/4 SLl 10:39 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
80956 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95
--init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 80961 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 80965 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm 
--no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 80972 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 80979 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 80986 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl 
local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 80994 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 81001 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path 
/mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 81009 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 81015 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 81025 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 
16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 81031 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566 81039 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine 
--min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
[19 further child processes (PIDs 81047 81056 81063 81072 81082 81091 81098 81106 81119 81131 81141 81154 81167 81179 81191 81202 81214 81226 81237), all pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py with command-line arguments identical to the --rank 3 process above]
80872 pts/4 S 0:00 \_ /bin/bash ./llama3_70b.sh node11
80904 pts/4 SLl 10:38 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
[28 further child processes (PIDs 80963 80973 80981 80988 80995 81004 81013 81020 81029 81037 81045 81053 81061 81071 81081 81088 81096 81105 81118 81128 81138 81152 81165 81176 81189 81201 81213 81225), all pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py with command-line arguments identical to the --rank 4 process above]
81238 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0
--no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566 81250 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566 81262 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 
1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566 81274 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566 80876 pts/4 S 0:00 \_ /bin/bash ./llama3_70b.sh node11 80905 pts/4 SLl 10:38 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80925 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 
--use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80927 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80929 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type 
HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80931 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80933 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80935 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models 
--micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80937 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80939 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 
949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80941 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80943 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80945 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 
--max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80947 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80949 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel 
--tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80951 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80953 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80955 
pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80959 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80962 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 
--no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80970 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80976 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 
1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80985 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80993 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80999 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer 
--ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81007 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81017 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 
--eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81024 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81032 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81041 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 
0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81049 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81057 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization 
RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81065 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 81075 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566 80882 pts/4 S 0:00 \_ /bin/bash ./llama3_70b.sh node11 80907 pts/4 SLl 10:37 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 
--num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81068 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81084 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 
--pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81093 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81101 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81111 pts/4 Sl 0:00 | \_ python3 -u 
pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81123 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81133 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion 
--swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81147 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81157 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir 
/mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81172 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81184 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81197 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective 
--overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81208 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81221 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save 
/mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81233 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81245 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81254 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 
--init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81269 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81279 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm 
--no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81291 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81298 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81306 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl 
local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81314 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81323 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path 
/mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81331 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81339 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81346 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 
16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81355 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81362 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine 
--min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81370 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 81378 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 
--world_size 64 --dist_url tcp://node11:34566 81385 pts/4 Sl 0:00 | \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566 80886 pts/4 S 0:00 \_ /bin/bash ./llama3_70b.sh node11 80908 pts/4 SLl 10:39 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 80969 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective 
--overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 80980 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 80989 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save 
/mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 80998 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81005 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81014 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 
--init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81021 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81030 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm 
--no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81038 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81046 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81054 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local 
--use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81062 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81070 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path 
/mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81079 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81087 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81095 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 
--num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81104 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81116 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 
3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81127 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81139 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 
64 --dist_url tcp://node11:34566 81150 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81164 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81175 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations 
--attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81188 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81199 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load 
/mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81211 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81222 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81235 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 
--use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81248 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566 81258 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer 
--log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
81270 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
81282 pts/4 Sl 0:00 \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
82346 pts/15 Ss 0:00 bash
82366 pts/15 S+ 0:00 \_ vim llama3_70b.sh
79835 pts/5 Ss 0:00 bash
82389 pts/5 R+ 0:00 \_ ps af
79661 pts/14 Ss+ 0:00 bash
65721 pts/3 Ss+ 0:00 bash
65679 pts/2 Ss 0:00 /bin/bash
65701 pts/2 S+ 0:00 \_ ssh node27 -p 3344
64008 pts/1 Ss+ 0:00 bash
1 pts/0 Ss+ 0:00 /bin/bash
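
Note: all of the pretrain_gpt.py workers in this listing share the same parallelism flags, so the layout they imply can be checked with a few lines of arithmetic. The sketch below is only an illustration built from the values visible in the command lines (it is not part of the job); the relation global batch = micro batch x data-parallel size x accumulation steps is the usual Megatron-LM convention.

# parallel_layout_check.py -- hypothetical helper; values copied from the ps output above
tensor_parallel   = 8    # --tensor-model-parallel-size 8
pipeline_parallel = 8    # --pipeline-model-parallel-size 8
world_size        = 64   # --world_size 64

model_parallel = tensor_parallel * pipeline_parallel       # ranks needed for one model replica
assert world_size % model_parallel == 0
data_parallel = world_size // model_parallel                # -> 1 replica

micro_batch  = 1   # --micro-batch-size 1
global_batch = 1   # --global-batch-size 1
accum_steps  = global_batch // (micro_batch * data_parallel)  # -> 1 micro-batch per step

print(f"TP={tensor_parallel} PP={pipeline_parallel} DP={data_parallel} accumulation={accum_steps}")

With TP=8 and PP=8 the 64 ranks form a single model replica, so the run has no data parallelism and each training step processes a single micro-batch (one sequence of --seq-length 8192).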