#!/usr/bin/env bash

# Default option values; each one can be overridden on the command line.
die_id="a"
save_flash="y"
force_train="n"
load_para="y"
ffe="-1"
dfe="-1"
term_offset="-1"

# Print the usage text. The "default:" fields show the current value of each
# option variable at the time the help is requested.
function print_usage
{
    echo "Usage: ./gddr6_update [options]"
    echo " "
    echo "GDDR6 software training script designed by hygon."
    echo " "
    echo "Options:"
    echo "-h, --help             Display this help"
    echo "-d, --die-id           Test device selection; example: 0,1; default: $die_id"
    echo "--save_flash           Whether to save parameters to Flash; default: $save_flash"
    echo "--force_train          force sw training; default: $force_train"
    echo "--load_para            Whether to load parameters when finished; default: $load_para"
    echo "--ffe                  ffe value; -1 to skip; default: $ffe"
    echo "--dfe                  dfe value; -1 to skip; default: $dfe"
    echo "--term_offset          term_offset value; -1 to skip; default: $term_offset"
}

# Parse command-line options into the global option variables above.
# Globals:   die_id, save_flash, force_train, load_para, ffe, dfe, term_offset (written)
# Arguments: the script's command-line arguments ("$@")
# Exits:     0 after printing help; 1 on an unknown option or when an option
#            that takes a value is given without one (or with an empty one).
function parse_args
{
    local key
    while [[ $# -gt 0 ]]
    do
        key="$1"
        case "$key" in
            -h|--help)
                print_usage
                exit 0
                ;;

            -d|--die-id|--save_flash|--force_train|--load_para|--ffe|--dfe|--term_offset)
                # Every one of these options consumes exactly one value. Only the
                # presence of the value is checked: values such as "-1" are valid,
                # so a leading dash must not be treated as "missing".
                if [[ $# -lt 2 || -z "$2" ]]; then
                    echo "Option $key requires a value" >&2
                    exit 1
                fi
                case "$key" in
                    -d|--die-id)   die_id="$2" ;;
                    --save_flash)  save_flash="$2" ;;
                    --force_train) force_train="$2" ;;
                    --load_para)   load_para="$2" ;;
                    --ffe)         ffe="$2" ;;
                    --dfe)         dfe="$2" ;;
                    --term_offset) term_offset="$2" ;;
                esac
                shift 2
                ;;

            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Create the timestamped log directory for this run.
# Globals (written and exported):
#   ut_date        - run timestamp, formatted YYYYmmdd-HHMMSS
#   ut_log_top_dir - parent directory holding the logs of all runs
#   ut_log_dir     - directory for this run: <top>/log_gddr6_update_<date>_<hostname>
# Returns: 0 on success, otherwise the status of the mkdir that failed.
function mk_log_dir
{
    # Assign first and export afterwards so the status of a command
    # substitution is not hidden behind the always-successful `export`.
    ut_date=$(date "+%Y%m%d-%H%M%S")
    ut_log_top_dir="/var/log/gddr6_update"
    ut_log_dir="${ut_log_top_dir}/log_gddr6_update_${ut_date}_$(hostname)"
    export ut_date ut_log_top_dir ut_log_dir

    # `mkdir -p` is a no-op when the directory already exists.
    mkdir -p "$ut_log_top_dir" || return

    # Deliberately without -p: the run directory is expected to be new.
    mkdir "$ut_log_dir"
}

# Set up the runtime environment relative to the directory containing this script.
# Globals read:    save_flash, force_train, load_para, ffe, dfe, term_offset
# Globals written: ut_top_dir, LD_LIBRARY_PATH, ut_hy_smi, ut_sw_wdq, ut_dcu_smu,
#                  ut_hyflash, ut_save_flash, ut_force_train, ut_load_para,
#                  ut_ffe, ut_dfe, ut_term_offset (all exported)
# Side effects:    changes the working directory to ut_top_dir
# Returns:         0 on success, 1 if the script directory cannot be resolved
#                  or entered.
function source_env
{
    local script_path

    # Resolve symlinks so the tools are located next to the real script file.
    script_path=$(readlink -f "$0") || return 1
    ut_top_dir=$(dirname "$script_path")
    export ut_top_dir
    cd "$ut_top_dir" || return 1

    # mini rocm lib: replaces (does not extend) any inherited LD_LIBRARY_PATH
    export LD_LIBRARY_PATH="$ut_top_dir/lib"
    # hy-smi; this value carries an option and is meant to be expanded
    # unquoted (word-split) by its users
    export ut_hy_smi="$ut_top_dir/bin/hy-smi --gddr6_update_force"
    # sw_wdq
    export ut_sw_wdq="$ut_top_dir/bin/sw_wdq"
    # dcu_smu_debug
    export ut_dcu_smu="$ut_top_dir/bin/dcu_smu_debug"
    # hyflash
    export ut_hyflash="$ut_top_dir/bin/hyflash"

    # command-line options, re-exported under the ut_ prefix
    export ut_save_flash="$save_flash"
    export ut_force_train="$force_train"
    export ut_load_para="$load_para"
    export ut_ffe="$ffe"
    export ut_dfe="$dfe"
    export ut_term_offset="$term_offset"

    return 0
}

# Keep-alive loop, intended to be started as a background job.
# It first probes DCU indices 0..dcu_pci_number-1 with the dcu_smu tool and
# keeps those for which the probe succeeds. It then re-issues two requests
# (-f 0 and -f 2) to each of them every 4 seconds.
# Globals read: dcu_pci_number, ut_dcu_smu
# Arguments:    $1 - PID of the process whose lifetime bounds this loop
# Returns:      0 immediately when dcu_pci_number is 0. Otherwise it does not
#               return: the shell running it exits (via `set -e`) as soon as a
#               request fails or the watched PID no longer exists.
function request_access_umc_watchdog
{
    local parent_pid=$1
    local probe_id
    local usable_dcus=""

    # Nothing to watch when no device was enumerated.
    if [ $dcu_pci_number -eq 0 ]; then
        echo "Not find DCU device!!!"
        return 0
    fi

    # Build a space-separated list of the DCUs that answer the probe.
    for ((probe_id = 0; probe_id < dcu_pci_number; probe_id++)); do
        if $ut_dcu_smu -d $probe_id -f 0 -t 6000 > /dev/null 2>&1; then
            usable_dcus+="${probe_id} "
        fi
    done
    echo "mp1 available dcus: $usable_dcus"

    # From here on any failing command terminates this shell; that is the
    # loop's only exit path.
    set -e
    while :
    do
        for probe_id in $usable_dcus; do
            # temperature request
            $ut_dcu_smu -d $probe_id -f 0 -t 6000
            # low-power request
            $ut_dcu_smu -d $probe_id -f 2 -t 6000
        done
        sleep 4
        # stop once the watched process is gone
        ps -p $parent_pid > /dev/null 2>&1
    done
    return 0
}

# Prune old run logs: keep the 10 most recently modified entries directly
# under $ut_log_top_dir and remove every other entry recursively.
# Globals read: ut_log_top_dir
# Returns:      0 on success, 1 when ut_log_top_dir is empty or not a directory
#               (in which case nothing is deleted).
function delete_redundant_logs
{
    local keep_count=10
    local entry

    # Never run the removal pipeline without a valid target directory: if the
    # directory change did not happen, `rm -rf` would act on entries of
    # whatever directory the caller happens to be in.
    if [[ -z "${ut_log_top_dir:-}" || ! -d "$ut_log_top_dir" ]]; then
        echo "delete_redundant_logs: invalid log directory '${ut_log_top_dir:-}'" >&2
        return 1
    fi

    # The subshell keeps the caller's working directory untouched, so no
    # `cd -` (which also prints a path) is needed afterwards.
    (
        cd -- "$ut_log_top_dir" || exit 1
        # `ls -t` lists newest first; everything from entry keep_count+1 on is
        # removed. Names are read line by line so embedded spaces are preserved.
        ls -t1 | tail -n "+$((keep_count + 1))" | while IFS= read -r entry; do
            rm -rf -- "$entry"
        done
    )
}

# Main workflow: prepare the environment and clocks, launch one sw_wdq job per
# selected DCU in the background, wait for all of them, then undo the
# temporary system changes.
# Globals read:    ut_log_dir, ut_date, die_id, plus everything exported by source_env
# Globals written: dcu_pci_number, ut_die_id (exported); die_id when it is "a"
# Returns:         0 on success or when a K100_AI_I device is detected (skipped);
#                  1 on a setup failure; otherwise the exit code of the last
#                  sw_wdq job that failed.
function app_run
{
    echo "ut_log_dir: $ut_log_dir"
    # Set up tool paths and exported options.
    source_env
    if [ $? -ne 0 ]; then
        echo "source_env failed!!!"
        return 1
    fi

    # Enumerate devices: the device count is the number of lines containing
    # "VendorID" in the sw_wdq output.
    dcu_info=$($ut_sw_wdq -get_dcu_number)
    export dcu_pci_number=$(echo "$dcu_info" |grep "VendorID" |wc -l)
    # Skip K100_AI_I devices entirely. grep is not silenced here, so the
    # matching lines are also printed.
    echo "$dcu_info" |grep "K100_AI_I"
    if [ $? -eq 0 ]; then
        echo "Skip K100_AI_I device."
        return 0
    fi

    # Select sclk level 6; a failure here is reported as the driver not being loaded.
    $ut_hy_smi --setsclk 6
    ret_code=$?
    if [ $ret_code -ne 0 ]; then
        echo "Not load driver!!!"
        return 1
    fi
    # Verify the clock: look for the literal text "1270Mhz *" in the `-s`
    # output (grep -F, so the asterisk is not a regex operator).
    # NOTE(review): presumably the "*" marks the currently active level - confirm
    # against hy-smi's output format.
    pass_flag=$($ut_hy_smi -s |grep -F "1270Mhz *")
    if [ "$pass_flag" == "" ]; then
        # For K100: retry with sclk level 8
        $ut_hy_smi --setsclk 8
        pass_flag=$($ut_hy_smi -s |grep -F "1270Mhz *")
        if [ "$pass_flag" == "" ]; then
            echo "gfx clock need 1.27Ghz!!!"
            return 1
        fi
    fi
    # Resolve the device selection. "a" (the default) means all devices: one
    # per matching line of the clock check above, numbered from 0.
    if [ "$die_id" == "a" ]; then
        # all
        dcu_num=$(echo "$pass_flag" |wc -l)
        die_id=$(seq -s , 0 $[dcu_num-1])
    fi
    # Turn the comma-separated list into a space-separated one for iteration.
    export ut_die_id="${die_id//,/ }"

    # Temporarily point the OpenCL ICD file at the bundled libamdocl64.so.
    # need_recover records how to undo this afterwards:
    #   1 = an ICD file existed and was backed up -> restore the backup
    #   2 = no ICD file existed                   -> delete the one created here
    # NOTE(review): no trap is installed in this function, so an interruption
    # before the restore step below would leave the modified ICD file in place.
    need_recover=""
    clinfo_dir="/etc/OpenCL/vendors"
    clinfo_path="$clinfo_dir/amdocl64.icd"
    clinfo_bk_path="$clinfo_dir/bk_amdocl64.icd"
    gddr6_libamdocl="/opt/hyhal/gddr6_update/lib/libamdocl64.so"
    if [ -f "$clinfo_path" ]; then
        #echo "cp $clinfo_path $clinfo_bk_path"
        cp $clinfo_path $clinfo_bk_path -f
        need_recover="1"
    else
        if [ ! -d "$clinfo_dir" ];then
            mkdir -p $clinfo_dir
        fi
        need_recover="2"
    fi
    echo "$gddr6_libamdocl" > $clinfo_path

    # Start the keep-alive watchdog in the background. It is given $$, which
    # in bash is the PID of the main script shell even inside a subshell.
    request_access_umc_watchdog $$ &
    watchdog_pid=$!
    sleep 0.1

    # Launch one sw_wdq job per selected DCU in the background. Each job logs
    # to its own file; stdbuf -oL requests line-buffered stdout.
    start_t=$(date "+%Y-%m-%d %H:%M:%S")
    echo "Start date: $start_t"
    dcus_pid=()
    log_files=()
    for dcu_id in $ut_die_id; do

        log_file="$ut_log_dir/gddr6_update_${ut_date}_dcu${dcu_id}.log"
        stdbuf -oL $ut_sw_wdq --gpu ${dcu_id} -sf $ut_save_flash -ft $ut_force_train -lp $ut_load_para \
            -ffe $ut_ffe -dfe $ut_dfe -term_offset $ut_term_offset > $log_file 2>&1 &
        # Keep PIDs and log paths in parallel arrays (same index = same job).
        dcus_pid[${#dcus_pid[@]}]=$!
        log_files[${#log_files[@]}]=$log_file
    done
    # Prune old run directories while the jobs are running.
    echo "delete redundant logs"
    delete_redundant_logs
    # Wait for every job. On failure, show the tail of its log. `ret` ends up
    # holding the exit code of the last job that failed.
    echo "dcus_pid: ${dcus_pid[@]}. wait..."
    echo " "
    ret=0
    for ((i=0; i<${#dcus_pid[@]}; i++)); do
        pid=${dcus_pid[$i]}
        log_file=${log_files[$i]}
        wait $pid
        ret_code=$?
        if [ $ret_code -ne 0 ]; then
            echo " "
            echo "Error: pid=$pid, exit=$ret_code, log=$log_file"
            tail -n 10 $log_file
            ret=$ret_code
        fi
    done

    # Undo the OpenCL ICD change according to need_recover (see above).
    case "$need_recover" in
        1)
            if [ -f "$clinfo_bk_path" ]; then
                #echo "mv $clinfo_bk_path $clinfo_path"
                mv $clinfo_bk_path $clinfo_path -f
            fi
            ;;
        2)
            echo "rm $clinfo_path"
            rm -rf $clinfo_path
            ;;
    esac

    # Stop the watchdog if it is still running.
    ps -p $watchdog_pid > /dev/null 2>&1
    if [ $? -eq 0 ]; then
        #echo "kill watch dog:$watchdog_pid"
        kill $watchdog_pid
    fi

    # Hand clock control back (performance level "auto").
    $ut_hy_smi --setperflevel auto

    # Report failure only after all clean-up steps above have run.
    if [ $ret -ne 0 ]; then
        return $ret
    fi

    # Success: print the end time and the elapsed wall-clock seconds.
    end_t=$(date "+%Y-%m-%d %H:%M:%S")
    duration=$(($(date +%s -d "${end_t}")-$(date +%s -d "${start_t}")))
    echo "End date: $end_t, duration: ${duration}s"
    echo "Complete!!!"
    return 0
}

# Prepare the per-run log directory (sets ut_log_dir and ut_date).
mk_log_dir

# Run the workflow with stdout and stderr mirrored into the terminal log.
# The script exits with app_run's status, not tee's.
app_run 2>&1 | tee "${ut_log_dir}/terminal_${ut_date}.log"
run_status=${PIPESTATUS[0]}
exit "$run_status"
