#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit

# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -eux  # abort on errors and unset variables; echo each command as it runs

# The following variables need to be set.
# Base container to be used - the container built in step 1 of the quick start guide
readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
# Location of dataset for phase 1
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
# Location of dataset for phase 2
readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/checkpoints"

readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
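
# Example submission (node count is hypothetical; the hyperparameter variables
# below fall back to their defaults if unset in the submitting environment):
#   PHASE=1 sbatch -N 8 run.sub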

# bind.sh (expected to sit next to this script) handles process placement:
# --cpu=exclusive pins each local rank to its own CPU subset, and --ib=single
# assigns a single InfiniBand HCA per rank.
BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"

# Create the checkpoint directory on every node before training starts.
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"

# Per-phase hyperparameters. BATCHSIZE, LR, and WARMUP_UPDATES may be
# overridden from the environment; the values below are the defaults.
PHASE1="\
    --train_batch_size=${BATCHSIZE:-16} \
    --learning_rate=${LR:-6e-3} \
    --warmup_proportion=${WARMUP_UPDATES:-0.2843} \
    --input_dir=/workspace/data \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --max_steps=7038 \
    --num_steps_per_checkpoint=2500 \
    "
PHASE2="\
    --train_batch_size=${BATCHSIZE:-4096} \
    --learning_rate=${LR:-4e-3} \
    --warmup_proportion=${WARMUP_UPDATES:-0.128} \
    --input_dir=/workspace/data_phase2 \
    --phase2 \
    --max_seq_length=512 \
    --max_predictions_per_seq=80 \
    --max_steps=1563 \
    --num_steps_per_checkpoint=1000 \
    --resume_from_checkpoint --phase1_end_step=7038 \
    "
# Phase 2 resumes from the checkpoint written at phase-1 step 7038, which is
# why --phase1_end_step matches phase 1's --max_steps above.
PHASES=( "$PHASE1" "$PHASE2" )

# Select phase 1 or 2 via the PHASE environment variable (defaults to 1).
PHASE=${PHASE:-1}
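
# Example: run phase 2 once phase 1 has written its final checkpoint
# (node count here is hypothetical):
#   PHASE=2 sbatch -N 8 run.sub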

BERT_CMD="\
    ${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
    --seed=42 \
    ${PHASES[$((PHASE-1))]} \
    --do_train \
    --config_file=/workspace/bert/bert_config.json \
    --output_dir=/results \
    --fp16 \
    --allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
    --gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
    --log_freq=1 \
    --local_rank=\${SLURM_LOCALID}"
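
# Note: \${SLURM_LOCALID} is escaped above so it expands inside each srun task,
# giving every process its own local rank rather than the submit shell's value.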

# Launch the training command in the container on every task in the allocation.
# --container-image/--container-mounts require the pyxis SLURM plugin; -l
# prefixes each log line with its task rank.
srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"
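
# Checkpoints and logs written to /results inside the container land in
# ${checkpointdir} on the host via the mount defined above.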