gaoqiong / lm-evaluation-harness / Commits

Commit 13faf0e8 authored May 07, 2024 by Yen-Ting Lin

Add .gitignore entries for evals/ and harness_eval_main_log.txt, and add harness_eval.slurm script

parent d62ce606
Showing 2 changed files with 91 additions and 0 deletions:

  .gitignore          +2  -0
  harness_eval.slurm  +89 -0
.gitignore

@@ -21,3 +21,5 @@ lm_eval/caching/.cache
 # don't track files created by wandb
 wandb
 examples/wandb
+evals/
+harness_eval_main_log.txt
harness_eval.slurm  0 → 100644
#!/bin/bash
# this is a multi-node SLURM script using the `accelerate` launcher
#SBATCH --job-name=eval_llm
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1   # crucial - only 1 task per node
#SBATCH --gres=gpu:8          # EDIT this if it's not 8 GPUs per node
#SBATCH --exclusive
#SBATCH --output=/mnt/home/f08944064/lighteval/logs/%x-%j.out
#SBATCH --error=/mnt/home/f08944064/lighteval/logs/%x-%j.err

echo "START TIME: $(date)"

source ~/.bashrc
source activate harness

# auto-fail on any errors in this script
set -eo pipefail

# log this script's variables/commands for future debugging needs
set -x

# EDIT the conda env and any startup scripts
# source /path/to/start-xxx-user # if you have something to preload before the job
# conda activate stas-xxx        # if you have a conda env to activate

LOG_PATH="harness_eval_main_log.txt"

# EDIT if it's not 8 GPUs per node
GPUS_PER_NODE=8
NNODES=$SLURM_NNODES
NUM_PROCESSES=$(expr $NNODES \* $GPUS_PER_NODE)

# define the node 0 hostname:port
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000

# note `\$SLURM_PROCID`: we don't want it interpolated until `srun` time, since
# otherwise all nodes will get 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER="lm_eval --model hf \
    "

# EDIT the path+name of the python script and whatever args it needs
export PROGRAM=" \
    --model_args pretrained=${1},parallelize=True \
    --tasks ${2} \
    --batch_size auto \
    --verbosity DEBUG \
    "
#--tasks examples/tasks/open_llm_leaderboard_tasks.txt \
#--model_args "pretrained=${1},trust_remote_code=True" \
#--model_parallel \
#--use_chat_template \

export CMD="$LAUNCHER $PROGRAM"
echo $CMD

# EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
# export TMPDIR=/scratch

# EDIT: useful for debugging if needed
#
# to debug NCCL issues
# export NCCL_DEBUG=INFO
#
# to unravel async errors w/o the correct traceback - potentially makes everything much slower
# export CUDA_LAUNCH_BLOCKING=1
#
# to force crashing on NCCL issues like a hanging broadcast
# export NCCL_ASYNC_ERROR_HANDLING=1

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --jobid $SLURM_JOB_ID \
    "

# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH

echo "END TIME: $(date)"
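The script's comment about `\$SLURM_PROCID` and the final `bash -c "$CMD"` rely on delayed variable interpolation: the command string must carry the variable reference literally, so that each `srun` task expands it with its own per-task value rather than the submitting shell's. A minimal sketch of that pattern (the variable names mirror the script; single quotes are just one way to defer the expansion):

```shell
# keep $SLURM_PROCID uninterpolated when CMD is defined -
# single quotes prevent the current shell from expanding it
CMD='echo rank:$SLURM_PROCID'

# srun would run this once per task; here we simulate a task with rank 3.
# bash -c receives the literal string and expands the variable itself.
SLURM_PROCID=3 bash -c "$CMD"   # prints "rank:3"
```

Since the script reads the model from `$1` and the task list from `$2`, it would presumably be submitted as something like `sbatch harness_eval.slurm <model_path_or_name> <task>` (hypothetical invocation; the commit does not show one).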