#!/bin/bash

#for llama2-7b pretrain test
#mpirun -np 8 --allow-run-as-root ./llama2_7b.sh localhost . >& llama2_7b.log &

#for llama2-13b pretrain test
#mpirun -np 8 --allow-run-as-root ./llama2_13b.sh localhost . >& llama2_13b.log &

# Launch ./llama3_70b.sh under Open MPI with 64 ranks spread over the hosts
# listed in ./hostfile (resolved relative to the current working directory).
launch_opts=(
  --allow-run-as-root            # permit running mpirun as the root user
  -np 64                         # total number of processes to start
  --hostfile hostfile            # hosts/slots to place the processes on
  --tag-output                   # prefix every output line with its job/rank tag
  --merge-stderr-to-stdout       # fold each rank's stderr into its stdout
  --output-filename tmp          # also write per-rank output under "tmp"
  -mca plm_rsh_args "-p 3344"    # extra args for the rsh/ssh launcher (port 3344)
  --bind-to none                 # do not bind processes to cores/sockets
  -mca plm_rsh_no_tree_spawn 1   # disable tree-based remote daemon spawning
)

# "node11" is handed to llama3_70b.sh as its first positional argument.
# NOTE(review): presumably the master/rendezvous host -- confirm in llama3_70b.sh.
/opt/mpi/bin/mpirun "${launch_opts[@]}" ./llama3_70b.sh node11
# How to calculate TGS (tokens per second per GPU):
#   TGS = sequence_length * global_batch_size / elapsed_time_per_iteration_in_seconds / total_gpu_count