#!/bin/bash
#
# Launches the llama3-70b pretrain run (./llama3_70b.sh) across the hosts
# listed in ./hostfile using Open MPI's mpirun.
#
# Usage: run from the directory that contains both `hostfile` and
# `llama3_70b.sh` (both are referenced by relative path).
#
# Earlier single-node launch lines, kept for reference:
#
# for llama2-7b pretrain test
#mpirun -np 8 --allow-run-as-root ./llama2_7b.sh localhost . >& llama2_7b.log &
#
# for llama2-13b pretrain test
#mpirun -np 8 --allow-run-as-root ./llama2_13b.sh localhost . >& llama2_13b.log &

# Options are kept in an array so each one can carry its own comment; they are
# passed to mpirun in exactly this order.
mpirun_args=(
  # Permit launching as root (mpirun refuses to by default).
  --allow-run-as-root
  # Total number of processes to start.
  # NOTE(review): presumably one process per GPU - confirm against hostfile.
  -np 64
  # File (relative to the current directory) listing the participating hosts.
  --hostfile hostfile
  # Prefix every output line with the job/rank that produced it.
  --tag-output
  # Fold each process's stderr into its stdout stream.
  --merge-stderr-to-stdout
  # Also save per-process output under the path "tmp" (the exact file layout
  # depends on the Open MPI version).
  --output-filename tmp
  # Extra arguments for the rsh/ssh launcher: connect to remote hosts on
  # port 3344 instead of the default ssh port.
  -mca plm_rsh_args "-p 3344"
  # Do not pin processes to specific cores/sockets.
  --bind-to none
  # Start every remote daemon directly from this node rather than through a
  # tree of ssh hops.
  -mca plm_rsh_no_tree_spawn 1
  # Program run by every process, followed by its single argument.
  # NOTE(review): "node11" looks like the master/rendezvous host name that
  # llama3_70b.sh expects - confirm against that script.
  ./llama3_70b.sh
  node11
)

# Last command of the script, so the script's exit status is mpirun's.
/opt/mpi/bin/mpirun "${mpirun_args[@]}"

# How to calculate TGS (tokens/s/GPU):
# TGS = sequence_length * global_batchsize / elapsed_time_per_iteration(s) / total_gpu_cards