#!/bin/bash
#SBATCH --job-name translation

# Slurm batch script that runs NEXP trials of ./run_and_time.sh inside a
# named container, writing one log file per trial under LOGDIR.
#
# Required environment:
#   DGXSYSTEM            - must be set; not referenced again in this script
#                          (presumably consumed by run_and_time.sh - confirm).
#   CONT                 - container image passed to srun --container-image.
#   DGXNGPU              - tasks per node for the training step.
#   SLURM_JOB_NUM_NODES  - node count of the allocation (normally set by Slurm).
#
# Optional environment (defaults are assigned below):
#   NEXP          - number of trials to run.
#   DATESTAMP     - prefix for the per-trial log file names.
#   CLEAR_CACHES  - when numerically equal to 1, drop caches before each trial.
#   DATADIR       - host directory mounted at /data in the container.
#   LOGDIR        - host directory for logs, mounted at /results.
#   SEED          - if set and non-empty, used for every trial instead of $RANDOM.

set -euxo pipefail

# Vars without defaults
: "${DGXSYSTEM:?DGXSYSTEM not set}"
: "${CONT:?CONT not set}"
# Checked up front so a missing value aborts before any srun is launched,
# instead of failing at the final step of the first trial.
: "${DGXNGPU:?DGXNGPU not set}"
: "${SLURM_JOB_NUM_NODES:?SLURM_JOB_NUM_NODES not set}"

# Vars with defaults
: "${NEXP:=5}"
: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}"
: "${CLEAR_CACHES:=1}"
: "${DATADIR:=/raid/datasets/xformer_v0p6/utf8}"
: "${LOGDIR:=./results}"

# Other vars
# Capture any caller-supplied SEED now; SEED itself is re-exported per trial.
readonly _seed_override=${SEED:-}
readonly _logfile_base="${LOGDIR}/${DATESTAMP}"
readonly _cont_name=translation
readonly _cont_mounts="${DATADIR}:/data,${LOGDIR}:/results"

# Setup directories: once from the batch shell (tee writes the logs from
# here), then once per node via srun.
mkdir -p "${LOGDIR}"
srun --ntasks="${SLURM_JOB_NUM_NODES}" mkdir -p "${LOGDIR}"

# Setup container: run a no-op under the container name that every later
# step refers to (presumably this instantiates the container so those steps
# can reuse it by name - confirm against the cluster's container plugin).
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-name="${_cont_name}" true

# Run experiments
for (( _experiment_index = 1; _experiment_index <= NEXP; _experiment_index++ )); do
  # Each trial runs in a subshell so the exported SEED does not leak into the
  # next trial; stdout and stderr are both tee'd to the trial's log file.
  (
    echo "Beginning trial ${_experiment_index} of ${NEXP}"

    # Print system info
    srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-name="${_cont_name}" python -c "
import mlperf_log_utils
from mlperf_logging.mllog import constants
mlperf_log_utils.mlperf_submission_log(constants.TRANSFORMER)"

    # Clear caches
    if [ "${CLEAR_CACHES}" -eq 1 ]; then
      srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
      # Record the cache clear through the logging helpers inside the container.
      srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-name="${_cont_name}" python -c "
from mlperf_logging.mllog import constants
from mlperf_log_utils import log_event
log_event(key=constants.CACHE_CLEAR, value=True)"
    fi

    # Run experiment
    # A caller-supplied SEED wins; otherwise each trial draws a fresh $RANDOM.
    export SEED=${_seed_override:-$RANDOM}
    srun --mpi=none --ntasks="$(( SLURM_JOB_NUM_NODES * DGXNGPU ))" --ntasks-per-node="${DGXNGPU}" \
      --container-name="${_cont_name}" --container-mounts="${_cont_mounts}" \
      ./run_and_time.sh
  ) |& tee "${_logfile_base}_${_experiment_index}.log"
done