Commit 13faf0e8 authored by Yen-Ting Lin

Add .gitignore entries for evals/ and harness_eval_main_log.txt, and add harness_eval.slurm script

parent d62ce606
.gitignore
@@ -21,3 +21,5 @@ lm_eval/caching/.cache
# don't track files created by wandb
wandb
examples/wandb
evals/
harness_eval_main_log.txt

harness_eval.slurm
#!/bin/bash
# this is a multi-node SLURM script using `accelerate` launcher
#SBATCH --job-name=eval_llm
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node
#SBATCH --gres=gpu:8 # EDIT this if it's not 8-gpus per node
#SBATCH --exclusive
#SBATCH --output=/mnt/home/f08944064/lighteval/logs/%x-%j.out
#SBATCH --error=/mnt/home/f08944064/lighteval/logs/%x-%j.err
echo "START TIME: $(date)"
source ~/.bashrc
source activate harness
# auto-fail on any errors in this script
set -eo pipefail
# logging script's variables/commands for future debug needs
set -x
# EDIT the conda env and any startup scripts
# source /path/to/start-xxx-user # if you have something to preload before the job
# conda activate stas-xxx # if you have conda env to activate
LOG_PATH="harness_eval_main_log.txt"
# EDIT if it's not 8-gpus per node
GPUS_PER_NODE=8
NNODES=$SLURM_NNODES
NUM_PROCESSES=$(expr $NNODES \* $GPUS_PER_NODE)
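# e.g. with the defaults above: NUM_PROCESSES = 1 node * 8 GPUs/node = 8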
# define the node 0 hostname:port
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
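# `scontrol show hostnames` prints one hostname per allocated node; `head -n 1`
# picks the first node as the rendezvous host (only relevant for multi-node launches)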
# note: we don't want `\$SLURM_PROCID` interpolated until `srun` time, since otherwise
# all nodes would get 0 and the launcher would hang
#
# the same goes for `\$(hostname -s|tr -dc '0-9')` - we want it interpolated at `srun` time
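# illustrative sketch of the pattern (hypothetical, not part of this launcher):
#   CMD="echo rank \$SLURM_PROCID"   # \$ keeps the variable unexpanded in this shell
#   srun bash -c "$CMD"              # each task's bash expands its own SLURM_PROCID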
LAUNCHER="lm_eval --model hf \
"
# EDIT the path+name of the python script and whatever args it needs
export PROGRAM="\
--model_args pretrained=${1},parallelize=True \
--tasks ${2} \
--batch_size auto \
--verbosity DEBUG \
"
#--tasks examples/tasks/open_llm_leaderboard_tasks.txt \
#--model_args "pretrained=${1},trust_remote_code=True" \
#--model_parallel \
#--use_chat_template \
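# usage sketch (assumed invocation; the positional args are otherwise undocumented):
#   sbatch harness_eval.slurm /path/to/pretrained-model hellaswag
# $1 feeds --model_args pretrained=..., $2 feeds --tasks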
export CMD="$LAUNCHER $PROGRAM"
echo $CMD
# EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
# export TMPDIR=/scratch
# EDIT: useful for debug if needed
#
# to debug NCCL issues
# export NCCL_DEBUG=INFO
#
# to unravel async errors w/o the correct traceback - potentially makes everything much slower
# export CUDA_LAUNCH_BLOCKING=1
#
# to force crashing on nccl issues like hanging broadcast
# export NCCL_ASYNC_ERROR_HANDLING=1
# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
--jobid $SLURM_JOB_ID \
"
# bash -c is needed for the delayed interpolation of env vars to work
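# `2>&1 | tee -a` merges stderr into stdout and appends a copy to $LOG_PATH while
# still streaming to the SLURM --output file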
srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH
echo "END TIME: $(date)"