# This is an example zero-shot eval script. Please first read the readme_evalharness.md under the ../MoE directory.

# CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step2000/
# CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71000/
# CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M12L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step5000/
CHECKPOINT_PATH=/blob/users/minjiaz/project/gpt3_distillation/checkpoint/gpt3-kd-test2-alpha1-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-15-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71426/
CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus--1-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B.json
RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log

PP_SIZE=1
TP_SIZE=1
NO_PP="true"
EP_PARALLEL_SIZE=1
# Currently the eval harness does not support data parallelism.
# However, for MoE models it is possible to enable a "fake data parallel"
# in order to load the experts on multiple GPUs. It is not real data
# parallelism because the same data is loaded on every GPU. On the other
# hand, it is better to use fewer GPUs than during training, to reduce
# communication overhead. See the commented example at the end of this script.
NUM_NODE=1
NUM_GPU_PER_NODE=1

# TASKS="lambada"
# WikiText-2, not used in the GPT-3 paper but used in the GPT-2 paper.
TASKS="lambada,wikitext"
# Tasks that appear in the GPT-3 paper (sorted in the order they appear in the paper), plus WikiText-2.
# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext"
# All tasks confirmed to work; there are more tasks on https://github.com/EleutherAI/lm-evaluation-harness that we did not test.
# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli"

VOCAB_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt

# export HF_DATASETS_OFFLINE=1

# Dummy arguments to make Megatron happy. No need to configure them:
# the eval framework reads these (and many other) arguments from the
# checkpoint file.
MEGATRON_REQUIRED_ARGS="\
    --num-layers -1 \
    --hidden-size -1 \
    --num-attention-heads -1 \
    --seq-length -1 \
    --max-position-embeddings -1 \
"

CMD="../../tasks/eval_harness/evaluate.py \
    --load $CHECKPOINT_PATH \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --micro-batch-size 12 \
    --no-load-optim \
    --no-load-rng \
    --inference \
    --disable-moe-token-dropping \
    --tokenizer-type GPT2BPETokenizer \
    --adaptive_seq_len \
    --eval_fp32 \
    --task_list $TASKS \
    --results_path $RESULT_PATH \
    --deepspeed \
    --deepspeed_config $CONFIG_PATH \
    $MEGATRON_REQUIRED_ARGS \
"

if [[ "${NO_PP}" = "true" ]]; then
    CMD="${CMD} \
        --no-pipeline-parallel"
fi

LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE"
$LAUNCHER $CMD
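
# Commented-out sketch of the "fake data parallel" setup described near the
# top of this script. The values below are hypothetical (they do not come
# from this example's checkpoint): for an MoE checkpoint whose experts do not
# fit on a single GPU, one could raise EP_PARALLEL_SIZE and NUM_GPU_PER_NODE
# together so the experts are spread over several GPUs on one node. The same
# evaluation data is still loaded on every GPU, so this is not real data
# parallelism; it only helps fit the experts in memory.
# EP_PARALLEL_SIZE=4
# NUM_NODE=1
# NUM_GPU_PER_NODE=4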