#!/bin/bash

#######################################
# Print the comma-separated device index list "0,1,...,count-1".
# Arguments:
#   $1 - number of devices; decimal digits only
# Outputs:
#   the list on stdout (nothing when the count is 0)
# Returns:
#   0 on success, 1 if $1 is not a non-negative decimal integer
#######################################
build_device_list() {
  local count=${1-} list="" i
  # Digits only: rejects empty/garbage input before it reaches the
  # arithmetic context below, where arbitrary text would be evaluated.
  [[ "$count" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so a zero-padded count such as "08" is not
  # rejected as an invalid octal literal.
  for ((i = 0; i < 10#$count; i++)); do
    list+="${i},"
  done
  printf '%s' "${list%,}"
}

# Positional arguments: $1 = GPU count, $2 = value exported as MASTER_ADDR.
GPUS=${1-}
if ! string=$(build_device_list "$GPUS"); then
  # The script previously carried on with an empty list after a cryptic
  # arithmetic error; keep that flow but report the real problem.
  printf '%s: GPU count (first argument) must be a non-negative integer, got "%s"\n' \
    "${0##*/}" "$GPUS" >&2
  string=""
fi

# Rank information is copied from the OMPI_COMM_WORLD_* variables
# (presumably provided by the Open MPI launcher -- confirm for your setup).
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export RANK=$OMPI_COMM_WORLD_RANK
export LOCAL_RANK=$local_rank

# Rendezvous address comes from the caller; the port is fixed.
export MASTER_ADDR=${2-}
export MASTER_PORT=12365

# NOTE(review): HSA_FORCE_FINE_GRAIN_PCIE looks like a ROCm runtime
# switch -- confirm its meaning against the ROCm documentation.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1


# Command line for ../src/train_bash.py, listed one option per line.
train_cmd=(
  python3 ../src/train_bash.py
  --stage sft
  --model_name_or_path ../../baichuan-7b-base
  --do_train
  --template default
  --dataset alpaca_gpt4_en
  --finetuning_type lora
  --lora_rank 16
  --lora_target W_pack,o_proj,gate_proj,down_proj,up_proj
  --output_dir out/baichuan-7b-lora-test7
  --per_device_train_batch_size 1
  --per_device_eval_batch_size 1
  --gradient_accumulation_steps 1
  --preprocessing_num_workers 8
  --lr_scheduler_type cosine
  --logging_steps 10
  --save_steps 2
  --eval_steps 2
  --learning_rate 1e-4
  --max_grad_norm 0.5
  --num_train_epochs 1.0
  --val_size 0.001
  --evaluation_strategy steps
  --load_best_model_at_end
  --plot_loss
  --fp16
  --deepspeed deepspeed.json
)

# APP stays a single flat string because it is expanded unquoted (and
# therefore split on whitespace) where the command is launched.
APP="${train_cmd[*]}"

#######################################
# Run the training command bound to the NUMA node whose number equals
# the given local rank (rank N -> CPU node N and memory node N).
# Replaces eight identical case branches; a rank outside 0-7 or an
# unset rank used to match nothing and exit 0 without launching.
# Globals:
#   string (read)  - comma-separated device list to expose
#   APP (read)     - command line as one flat string, split on whitespace
#   HIP_VISIBLE_DEVICES (exported)
# Arguments:
#   $1 - local rank; must be a non-negative decimal integer
# Returns:
#   numactl's exit status, or 1 if the rank is missing or malformed
#######################################
launch_on_numa_node() {
  local node=${1-}
  if ! [[ "$node" =~ ^[0-9]+$ ]]; then
    printf '%s: local rank must be a non-negative integer, got "%s"\n' \
      "${0##*/}" "$node" >&2
    return 1
  fi

  export HIP_VISIBLE_DEVICES=$string
  # APP is a flat string, so it is intentionally left unquoted to be
  # word-split into the command and its arguments.
  # shellcheck disable=SC2086
  numactl --cpunodebind="$node" --membind="$node" ${APP}
}

# Last command of the script, so its status becomes the exit status.
launch_on_numa_node "${local_rank}"