Commit e4575be9 authored by huaerkl

v1.0
[tool.black]
line-length = 119
target-version = ['py35']
absl-py==1.4.0
aiohttp==3.8.5
aiosignal==1.3.1
#apex @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/apex/torch-1.10/apex-0.1%2Bf49ddd4.abi0.dtk2304.torch1.10-cp38-cp38-manylinux2014_x86_64.whl#sha256=938b1ac70165ace45ee218d1181d409a172ef873e8e640c8ae794a9758260eae
appdirs==1.4.4
async-timeout==4.0.2
attrs==23.1.0
black==21.4b0
cachetools==5.3.1
certifi==2023.5.7
charset-normalizer==3.2.0
click==8.1.6
datasets==2.13.1
#deepspeed @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/deepspeed/torch-1.10/deepspeed-0.9.2%2Bgit25d5540.abi0.dtk2304.torch1.10.0-cp38-cp38-manylinux2014_x86_64.whl#sha256=c400f0fc214e8bcac44dde763fb7459ee1f7461049962ef456e1801cf9610bbb
dill==0.3.6
einops==0.6.1
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
google-auth==2.22.0
google-auth-oauthlib==1.0.0
grpcio==1.56.2
hjson==3.1.0
huggingface-hub==0.16.4
idna==3.4
importlib-metadata==6.8.0
isort==5.12.0
joblib==1.3.1
Markdown==3.4.3
MarkupSafe==2.1.3
mpi4py==3.1.4
multidict==6.0.4
multiprocess==0.70.14
mypy-extensions==1.0.0
ninja==1.11.1
nltk==3.8.1
numpy==1.23.5
oauthlib==3.2.2
packaging==23.1
pandas==2.0.3
parameterized==0.9.0
pathspec==0.11.1
Pillow==10.0.0
protobuf==4.23.4
psutil==5.9.5
py-cpuinfo==9.0.0
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pybind11==2.11.1
pydantic==1.10.11
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0.1
regex==2023.6.3
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
safetensors==0.3.1
six==1.16.0
some-package==0.1
tensorboard==2.13.0
tensorboard-data-server==0.7.1
tokenizers==0.13.3
toml==0.10.2
#torch @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/pytorch/torch-1.10/py38/torch-1.10.0%2Bgite378c3c.abi0.dtk2304-cp38-cp38-manylinux2014_x86_64.whl#sha256=0633dfa4814c2bd460e0bcd76c22c50e4824d21adf61274625ea2110f0063aea
#torchaudio @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/pytorch/torch-1.10/py38/torchaudio-0.10.0%2Ba9847c3.abi0.dtk2304.torch1.10.0-cp38-cp38-manylinux2014_x86_64.whl#sha256=a26117aafe8e7e331f3a6f31e58a4d2959a8cd860b8cd3e1586d774922d7ff3e
#torchvision @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/pytorch/torch-1.10/py38/torchvision-0.10.0%2Bgit48e6bbb.abi0.dtk2304.torch1.10-cp38-cp38-manylinux2014_x86_64.whl#sha256=1e8a3baddb1afe5c0c2373d6129e58a3877af6f942f28337a84af4835dad332b
tqdm==4.65.0
transformers==4.31.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==1.26.16
Werkzeug==2.3.6
xxhash==3.2.0
yarl==1.9.2
zipp==3.16.2
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
BASE_DATA_PATH=/data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
TP=1
PP=2
HIDDEN=1024
LAYERS=24
SEQ=1024
GLOBAL_BATCH=2
WORKER_STR=""
MICRO_BATCH=1
DTYPE="bf16"
LOG_DIR="/tmp/tensorboard/tp${TP}_pp${PP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_${DTYPE}_fix3"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 10000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval 10000 \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
WORKER_STR="-i worker-0:0,1"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
#BASE_DATA_PATH=tests/data/gpt2
#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document
#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json
#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
#CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
#TP=4
#PP=4
# Debug
DEBUG_MODE=0
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
fi
TP=2
PP=2
DP=4
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4
MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters $TRAIN_ITERS \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval ${EXIT_INTERVAL} \
--save ${CHECKPOINT_PATH} \
--load ${LOAD_CHECKPOINT_PATH} \
--position-embedding-type alibi \
--override-lr-scheduler \
--embed-layernorm \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
BASE_DATA_PATH=/data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
TP=1
PP=1
DP=2
WORLD_SIZE=$((TP*PP*DP))
HIDDEN=1024
LAYERS=24
SEQ=1024
GLOBAL_BATCH=1
WORKER_STR=""
MICRO_BATCH=1
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="fp16"
EXP_DIR=${HOME}/experiments/results/bf16
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 10000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval 10000 \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": false
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 8
},
"wall_clock_breakdown" : true
}
EOT
WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
#BASE_DATA_PATH=tests/data/gpt2
#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document
#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json
#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
#CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
#TP=4
#PP=4
# Debug
DEBUG_MODE=0
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
fi
TP=2
PP=2
DP=4
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4
MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp4
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters $TRAIN_ITERS \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval ${EXIT_INTERVAL} \
--save ${CHECKPOINT_PATH} \
--load ${LOAD_CHECKPOINT_PATH} \
--universal-checkpoint \
--position-embedding-type alibi \
--override-lr-scheduler \
--embed-layernorm \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
# Scripts
This directory should be organized with sub-folders for the different kinds of scripts.
# Inference scripts for BLOOM
Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts
## Inference solutions for BLOOM 176B
Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-server
import json
import argparse
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--dir',
type=str,
required=True,
help='directory to save data'
)
args = parser.parse_args()
return args
def main():
args = get_args()
for i in range(10):
row_limit = 1000
rows_to_save = [{'text': ''.join([str(i)+'-*']*128)}]
with open('{}/dataset_{}.json'.format(args.dir, i), 'w') as f:
f.write(
'\n'.join(json.dumps(_i) for _i in rows_to_save*row_limit)
)
if __name__ == '__main__':
main()
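# Example usage (this is how the sampling setup script further below invokes it):
#   python create_dummy_dataset.py --dir /tmp
# Each resulting dataset_i.json then holds 1000 identical JSON lines whose "text"
# field repeats the digit i in 128 "i-*" chunks, e.g. a line of dataset_3.json:
#   {"text": "3-*3-*3-* ... 3-*"}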
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processing data for pretraining."""
import argparse
import json
import multiprocessing
import os
import sys
from megatron.data.indexed_dataset import best_fitting_dtype
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir)))
import time
import torch
try:
import nltk
nltk_available = True
except ImportError:
nltk_available = False
from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset
# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
_period_context_fmt = r"""
\S* # some word material
%(SentEndChars)s # a potential sentence ending
\s* # <-- THIS is what I changed
(?=(?P<after_tok>
%(NonWord)s # either other punctuation
|
(?P<next_tok>\S+) # <-- Normally you would have \s+ here
))"""
class IdentitySplitter(object):
def tokenize(self, *text):
return text
class Encoder(object):
def __init__(self, args):
self.args = args
def initializer(self):
# Use Encoder class as a container for global data
Encoder.tokenizer = build_tokenizer(self.args)
if self.args.split_sentences:
if not nltk_available:
print("NLTK is not available to split sentences.")
exit()
splitter = nltk.load("tokenizers/punkt/english.pickle")
if self.args.keep_newlines:
# this prevents punkt from eating newlines after sentences
Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
train_text = splitter._params,
lang_vars = CustomLanguageVars())
else:
Encoder.splitter = splitter
else:
Encoder.splitter = IdentitySplitter()
def encode(self, json_line):
data = json.loads(json_line)
ids = {}
for key in self.args.json_keys:
text = data[key]
doc_ids = []
for sentence in Encoder.splitter.tokenize(text):
sentence_ids = Encoder.tokenizer.tokenize(sentence)
if len(sentence_ids) > 0:
doc_ids.append(sentence_ids)
if len(doc_ids) > 0 and self.args.append_eod:
doc_ids[-1].append(Encoder.tokenizer.eod)
ids[key] = doc_ids
return ids, len(json_line)
def get_args():
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='input data')
group.add_argument('--input', type=str,
help='Path to input JSON')
group.add_argument('--datasets', nargs='+', default=None,
help='Paths to one or more input datasets to merge')
group.add_argument('--json-keys', nargs='+', default=['text'],
help='space-separated list of keys to extract from the JSON')
group.add_argument('--split-sentences', action='store_true',
help='Split documents into sentences.')
group.add_argument('--keep-newlines', action='store_true',
help='Keep newlines between sentences when splitting.')
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--tokenizer-type', type=str, required=True,
choices=['BertWordPieceLowerCase','BertWordPieceCase',
'GPT2BPETokenizer', 'PretrainedFromHF'],
help='What type of tokenizer to use.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file (if necessary).')
group.add_argument('--append-eod', action='store_true',
help='Append an <eod> token to the end of a document.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group = parser.add_argument_group(title='output data')
group.add_argument('--output-prefix', type=str, required=True,
help='Path to binary output file without suffix')
group.add_argument('--dataset-impl', type=str, default='mmap',
choices=['lazy', 'cached', 'mmap'])
group = parser.add_argument_group(title='runtime')
group.add_argument('--workers', type=int, default=1,
help='Number of worker processes to launch')
group.add_argument('--log-interval', type=int, default=100,
help='Interval between progress updates')
args = parser.parse_args()
args.keep_empty = False
if args.tokenizer_type.lower().startswith('bert'):
if not args.split_sentences:
print("Bert tokenizer detected, are you sure you don't want to split sentences?")
# some default/dummy values for the tokenizer
args.rank = 0
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0
return args
def main():
args = get_args()
startup_start = time.time()
print("Opening", args.input)
fin = open(args.input, 'r', encoding='utf-8')
if nltk_available and args.split_sentences:
nltk.download("punkt", quiet=True)
encoder = Encoder(args)
tokenizer = build_tokenizer(args)
pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
encoded_docs = pool.imap(encoder.encode, fin, 25)
#encoded_docs = map(encoder.encode, fin)
level = "document"
if args.split_sentences:
level = "sentence"
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Output prefix: {args.output_prefix}")
output_bin_files = {}
output_idx_files = {}
builders = {}
for key in args.json_keys:
output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
key, level)
output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
key, level)
builders[key] = indexed_dataset.make_builder(output_bin_files[key],
impl=args.dataset_impl,
dtype=best_fitting_dtype(tokenizer.vocab_size))
startup_end = time.time()
proc_start = time.time()
total_bytes_processed = 0
print("Time to startup:", startup_end - startup_start)
for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
total_bytes_processed += bytes_processed
for key, sentences in doc.items():
if len(sentences) == 0:
continue
for sentence in sentences:
builders[key].add_item(torch.IntTensor(sentence))
builders[key].end_document()
if i % args.log_interval == 0:
current = time.time()
elapsed = current - proc_start
mbs = total_bytes_processed/elapsed/1024/1024
print(f"Processed {i} documents",
f"({i/elapsed} docs/s, {mbs} MB/s).",
file=sys.stderr)
for key in args.json_keys:
builders[key].finalize(output_idx_files[key])
if __name__ == '__main__':
main()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain GPT"""
import torch
from functools import partial
from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt_dataset import build_train_valid_test_datasets
from megatron.model import GPTModel, GPTModelPipe
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group
import deepspeed
from deepspeed.runtime.utils import see_memory_usage
import os
import subprocess
collected_sample = {}
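# The mapping below relies on the dummy data from create_dummy_dataset.py: every
# document in dataset_i repeats the digit str(i), and with the standard gpt2 BPE
# vocab the single-character digit tokens '0'..'9' have ids 15..24, so the first
# digit token found in a sample identifies which dataset it was drawn from (see
# the counting loop in forward_step below).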
token_dict = {
15: 0,
16: 1,
17: 2,
18: 3,
19: 4,
20: 5,
21: 6,
22: 7,
23: 8,
24: 9,
}
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
see_memory_usage(f"Before Building Model", force=True)
args = get_args()
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=None if args.remote_device=='none' else args.remote_device,
config=args.deepspeed_config,
enabled=args.zero_stage==3):
if args.deepspeed and mpu.get_pipeline_model_parallel_world_size() > 1:
model = GPTModelPipe(
num_tokentypes=0,
parallel_output=True
)
# This is a hack to give us a reference to get_batch_pipe from within training.py
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe
# Precompute the attention mask and store it in args. This avoids having to
# pipeline it as an activation during training. The mask is constant, and thus
# we can reuse it.
attention_mask = torch.tril(torch.ones(
(1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view(
1, 1, args.seq_length, args.seq_length)
# Convert attention mask to binary:
attention_mask = (attention_mask < 0.5)
if args.fp16:
attention_mask = attention_mask.half()
elif args.bf16:
attention_mask = attention_mask.bfloat16()
args.attn_mask = attention_mask
else:
model = GPTModel(
num_tokentypes=0,
parallel_output=True,
pre_process=pre_process,
post_process=post_process
)
see_memory_usage(f"After Building Model", force=True)
return model
def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()
tokenizer = get_tokenizer()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
# tokens_ = data_b['text'].long()
tokens_ = data_b[keys[0]].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, labels, loss_mask, attention_mask, position_ids
def get_batch_pipe(data):
"""Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
args = get_args()
tokenizer = get_tokenizer()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
tokens_ = data_b['text'].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return (tokens, position_ids, attention_mask), (labels, loss_mask)
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
def forward_step(data_iterator, model):
"""Forward step."""
args = get_args()
timers = get_timers()
# Get the batch.
timers('batch-generator').start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch-generator').stop()
for row in tokens.detach().cpu():
token_idx = [i for i in row.tolist() if i in list(token_dict.keys())][0]
num = "dataset_{}".format(token_dict[token_idx])
if num in collected_sample:
collected_sample[num] += 1
else:
collected_sample[num] = 1
print_rank_0(collected_sample)
output_tensor = model(tokens, position_ids, attention_mask,
labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
def command_exists(cmd):
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0
def git_ds_info():
from deepspeed.env_report import main as ds_report
ds_report()
# Write out version/git info
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git'):
try:
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
git_hash = "unknown"
git_branch = "unknown"
else:
git_hash = "unknown"
git_branch = "unknown"
print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****')
if __name__ == "__main__":
git_ds_info()
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
#!/bin/bash
BASE_DATA_PATH=/tmp
INPUT_PATH=/tmp
OUTPUT_PATH=/tmp
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ${BASE_DATA_PATH}
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ${BASE_DATA_PATH}
python create_dummy_dataset.py --dir ${INPUT_PATH}
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_0.json \
--output-prefix ${OUTPUT_PATH}/dataset-0 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_1.json \
--output-prefix ${OUTPUT_PATH}/dataset-1 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_2.json \
--output-prefix ${OUTPUT_PATH}/dataset-2 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_3.json \
--output-prefix ${OUTPUT_PATH}/dataset-3 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_4.json \
--output-prefix ${OUTPUT_PATH}/dataset-4 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${BASE_DATA_PATH}/logs
DATASET_0="${OUTPUT_PATH}/dataset-0_text_document"
DATASET_1="${OUTPUT_PATH}/dataset-1_text_document"
DATASET_2="${OUTPUT_PATH}/dataset-2_text_document"
DATASET_3="${OUTPUT_PATH}/dataset-3_text_document"
DATASET_4="${OUTPUT_PATH}/dataset-4_text_document"
DATASET="0.1 ${DATASET_0} 0.25 ${DATASET_1} 0.2 ${DATASET_2} 0.15 ${DATASET_3} 0.3 ${DATASET_4}"
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
# 52B
TP=4
PP=16
HIDDEN=1024
LAYERS=24
SEQ=128
GLOBAL_BATCH=16
WORKER_STR=""
MICRO_BATCH=8
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
# run_cmd="deepspeed $WORKER_STR ${DIR}/test_sampling.py $@ ${options}"
run_cmd="deepspeed $WORKER_STR test_sampling.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = megatron
known_third_party =
apex
codecarbon
datasets
deepspeed
git
nltk
numpy
pytest
tensorboard
torch
tqdm
transformers
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup for pip package."""
import os
import sys
import setuptools
if sys.version_info < (3,):
raise Exception("Python 2 is not supported by Megatron.")
from megatron.package_info import (
__description__,
__contact_names__,
__url__,
__download_url__,
__keywords__,
__license__,
__package_name__,
__version__,
)
with open("README.md", "r") as fh:
long_description = fh.read()
###############################################################################
# Dependency Loading #
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
def req_file(filename):
with open(filename) as f:
content = f.readlines()
return [x.strip() for x in content]
install_requires = req_file("requirements.txt")
setuptools.setup(
name=__package_name__,
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version=__version__,
description=__description__,
long_description=long_description,
long_description_content_type="text/markdown",
# The project's main homepage.
url=__url__,
author=__contact_names__,
maintainer=__contact_names__,
# The licence under which the project is released
license=__license__,
classifiers=[
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'Intended Audience :: Information Technology',
# Indicate what your project relates to
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development :: Libraries :: Python Modules',
# Supported python versions
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
# Additional Setting
'Environment :: Console',
'Natural Language :: English',
'Operating System :: OS Independent',
],
python_requires='>=3.6',
packages=setuptools.find_packages(),
install_requires=install_requires,
# Add in any packaged data.
include_package_data=True,
zip_safe=False,
# PyPI package information.
keywords=__keywords__
)
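# A minimal usage sketch (standard pip commands, run from the repository root):
#   pip install .      # regular install
#   pip install -e .   # editable install for development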
# Fast Setup instructions
This quick setup guide covers 3 steps:
1. installing software
2. preparing data
3. running the script
This is useful if you need to ask someone to reproduce problems with `Megatron-DeepSpeed`.
## 1. Software
Please follow this exact order.
0. Create a new conda env if need be or activate an existing environment.
1. Install `pytorch`. Choose the desired version from the install instructions [here](https://pytorch.org/get-started/locally/); for conda it'd be:
```
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
```
2. Install system-wide `cuda` if you don't have it already. [NVIDIA instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). Of course ideally use [the premade packages for your distro](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#package-manager-installation).
Use the same major version as pytorch's cuda build. To check use:
```
python -c 'import torch; print(f"pt={torch.__version__}, cuda={torch.version.cuda}")'
```
The minor versions don't actually have to match, but then you will need to hack the `apex` installer to ignore minor version changes; see below.
3. Install `apex`
```
git clone https://github.com/NVIDIA/apex
cd apex
pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 2>&1 | tee build.log
cd -
```
If the pytorch and system-wide cuda minor versions don't match, it's not a problem; you just need to bypass `apex`'s version check by applying this patch before building:
```
diff --git a/setup.py b/setup.py
index d76e998..f224dae 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,8 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
print(raw_output + "from " + cuda_dir + "/bin\n")
if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
+ # allow minor diffs
+ if bare_metal_minor != torch_binary_minor: return
raise RuntimeError(
"Cuda extensions are being compiled with a version of Cuda that does "
"not match the version used to compile Pytorch binaries. "
```
4. Check out `Megatron-DeepSpeed` and install its requirements
```
git clone https://github.com/bigscience-workshop/Megatron-DeepSpeed
cd Megatron-DeepSpeed
pip install -r requirements.txt
```
## 2. Data
We will work inside the `Megatron-DeepSpeed` clone:
```
cd Megatron-DeepSpeed
```
Prepare data for preprocessing
```
mkdir -p data
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O data/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O data/gpt2-merges.txt
python -c 'from datasets import load_dataset; ds = load_dataset("stas/oscar-en-10k", split="train", keep_in_memory=False); ds.to_json(f"data/oscar-en-10k.jsonl", orient="records", lines=True, force_ascii=False)'
```
Pre-process a small dataset to be used for training
```
python tools/preprocess_data.py \
--input data/oscar-en-10k.jsonl \
--output-prefix data/meg-gpt2-oscar-en-10k \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file data/gpt2-merges.txt \
--vocab data/gpt2-vocab.json \
--append-eod \
--workers 4
```
You now have `data/meg-gpt2-oscar-en-10k`, plus the vocab and merges files, to pass as arguments to training; the next section shows how to use them.
Note that Megatron expects the `data/meg-gpt2-oscar-en-10k_text_document` prefix later in `--data-path`.
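For reference, here is a minimal sketch of what the preprocessing step leaves on disk and how the prefix is referenced later (the listing is the expected layout, not output captured from a real run):
```
ls data/meg-gpt2-oscar-en-10k*
# data/meg-gpt2-oscar-en-10k_text_document.bin
# data/meg-gpt2-oscar-en-10k_text_document.idx

DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document   # prefix only, without the .bin/.idx suffix
```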
## 3. Train
Here is a tiny model training setup configured for 2 GPUs, training on the data we prepared in step 2.
Put it in a script or run it directly.
If you have only 1 GPU, change these 2 lines below to:
```
N_GPUS=1
TP_SIZE=1
```
The script:
```
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document
TENSORBOARD_PATH=output_dir/tensorboard
N_GPUS=2
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=16
TP_SIZE=2
PP_SIZE=1
NLAYERS=2
NHIDDEN=8
NHEADS=2
SEQ_LEN=512
VOCAB_SIZE=50257
SAVE_INTERVAL=50
TRAIN_SAMPLES=10_000
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--rampup-batch-size 2 2 1_000 \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--lr-warmup-samples 5 \
--min-lr 1e-6 \
--lr-decay-style cosine \
--lr-decay-samples 12 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--embed-layernorm \
--fp16 \
--partition-activations \
--seed 42 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
"
OUTPUT_ARGS=" \
--exit-interval 100 \
--log-interval 10 \
--save-interval $SAVE_INTERVAL \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--kill-switch-path /tmp/kill-switch \
"
ZERO_STAGE=1
config_json="./ds_config.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS"
MASTER_ADDR=localhost
MASTER_PORT=6777
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $N_GPUS \
--nnodes 1 \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
$LAUNCHER pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--distributed-backend nccl \
$ALL_ARGS \
"
echo $CMD
$CMD
```
You can, of course, run this as a slurm script, but here is [a full slurm script example](https://github.com/bigscience-workshop/bigscience/blob/d57b76bb592832bb4d2054cd5cbf132796be2d83/train/tr11-176B-ml/setup-test-n2.slurm), which has some tweaks to get `MASTER_ADDR` and a few other bits right under the SLURM environment on JeanZay, which may or may not be needed if you run it elsewhere.
Remember to wipe out `$CHECKPOINT_PATH` if you change the model shape and a checkpoint with the old shapes has already been saved.
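For example, with the paths used in the script above, that amounts to:
```
rm -rf checkpoints/gpt2
```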
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tasks data utility."""
import re
import numpy as np
def clean_text(text):
"""Remove new lines and multiple spaces and adjust end of sentence dot."""
text = text.replace("\n", " ")
text = re.sub(r'\s+', ' ', text)
for _ in range(3):
text = text.replace(' . ', '. ')
return text
def build_sample(ids, types, paddings, label, unique_id):
"""Convert to numpy and return a sample consumed by the batch producer."""
ids_np = np.array(ids, dtype=np.int64)
types_np = np.array(types, dtype=np.int64)
paddings_np = np.array(paddings, dtype=np.int64)
sample = ({'text': ids_np,
'types': types_np,
'padding_mask': paddings_np,
'label': int(label),
'uid': int(unique_id)})
return sample
def build_tokens_types_paddings_from_text(text_a, text_b,
tokenizer, max_seq_length):
"""Build token types and paddings, trim if needed, and pad if needed."""
text_a_ids = tokenizer.tokenize(text_a)
text_b_ids = None
if text_b is not None:
text_b_ids = tokenizer.tokenize(text_b)
return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
max_seq_length, tokenizer.cls,
tokenizer.sep, tokenizer.pad)
def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
cls_id, sep_id, pad_id):
"""Build token types and paddings, trim if needed, and pad if needed."""
ids = []
types = []
paddings = []
# [CLS].
ids.append(cls_id)
types.append(0)
paddings.append(1)
# A.
len_text_a = len(text_a_ids)
ids.extend(text_a_ids)
types.extend([0] * len_text_a)
paddings.extend([1] * len_text_a)
# [SEP].
ids.append(sep_id)
types.append(0)
paddings.append(1)
# B.
if text_b_ids is not None:
len_text_b = len(text_b_ids)
ids.extend(text_b_ids)
types.extend([1] * len_text_b)
paddings.extend([1] * len_text_b)
# Cap the size.
trimmed = False
if len(ids) >= max_seq_length:
max_seq_length_m1 = max_seq_length - 1
ids = ids[0:max_seq_length_m1]
types = types[0:max_seq_length_m1]
paddings = paddings[0:max_seq_length_m1]
trimmed = True
# [SEP].
if (text_b_ids is not None) or trimmed:
ids.append(sep_id)
if text_b_ids is None:
types.append(0)
else:
types.append(1)
paddings.append(1)
# Padding.
padding_length = max_seq_length - len(ids)
if padding_length > 0:
ids.extend([pad_id] * padding_length)
types.extend([pad_id] * padding_length)
paddings.extend([0] * padding_length)
return ids, types, paddings
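# Worked example (hypothetical ids; max_seq_length=8, cls_id=101, sep_id=102, pad_id=0):
#   build_tokens_types_paddings_from_ids([7, 8], [9], 8, 101, 102, 0)
#   returns ids      = [101, 7, 8, 102, 9, 102, 0, 0]
#           types    = [0, 0, 0, 0, 1, 1, 0, 0]
#           paddings = [1, 1, 1, 1, 1, 1, 0, 0]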
import os
import argparse
import collections
import numpy as np
import torch
def process_files(args):
all_predictions = collections.OrderedDict()
all_labels = collections.OrderedDict()
all_uid = collections.OrderedDict()
for path in args.paths:
path = os.path.join(path, args.prediction_name)
try:
data = torch.load(path)
for dataset in data:
name, d = dataset
predictions, labels, uid = d
if name not in all_predictions:
all_predictions[name] = np.array(predictions)
if args.labels is None:
args.labels = [i for i in range(all_predictions[name].shape[1])]
if args.eval:
all_labels[name] = np.array(labels)
all_uid[name] = np.array(uid)
else:
all_predictions[name] += np.array(predictions)
assert np.allclose(all_uid[name], np.array(uid))
except Exception as e:
print(e)
continue
return all_predictions, all_labels, all_uid
def get_threshold(all_predictions, all_labels, one_threshold=False):
if one_threshold:
all_predictions = {'combined': np.concatenate(list(all_predictions.values()))}
all_labels = {'combined': np.concatenate(list(all_labels.values()))}
out_thresh = []
for dataset in all_predictions:
preds = all_predictions[dataset]
labels = all_labels[dataset]
out_thresh.append(calc_threshold(preds, labels))
return out_thresh
def calc_threshold(p, l):
trials = [(i) * (1. / 100.) for i in range(100)]
best_acc = float('-inf')
best_thresh = 0
for t in trials:
acc = ((apply_threshold(p, t).argmax(-1) == l).astype(float)).mean()
if acc > best_acc:
best_acc = acc
best_thresh = t
return best_thresh
def apply_threshold(preds, t):
assert (np.allclose(preds.sum(-1), np.ones(preds.shape[0])))
prob = preds[:, -1]
thresholded = (prob >= t).astype(int)
preds = np.zeros_like(preds)
preds[np.arange(len(thresholded)), thresholded.reshape(-1)] = 1
return preds
def threshold_predictions(all_predictions, threshold):
if len(threshold) != len(all_predictions):
threshold = threshold + [threshold[-1]] * (len(all_predictions) - len(threshold))
for i, dataset in enumerate(all_predictions):
thresh = threshold[i]
preds = all_predictions[dataset]
all_predictions[dataset] = apply_threshold(preds, thresh)
return all_predictions
def postprocess_predictions(all_predictions, all_labels, args):
for d in all_predictions:
all_predictions[d] = all_predictions[d] / len(args.paths)
if args.calc_threshold:
args.threshold = get_threshold(all_predictions, all_labels, args.one_threshold)
print('threshold', args.threshold)
if args.threshold is not None:
all_predictions = threshold_predictions(all_predictions, args.threshold)
return all_predictions, all_labels
def write_predictions(all_predictions, all_labels, all_uid, args):
all_correct = 0
count = 0
for dataset in all_predictions:
preds = all_predictions[dataset]
preds = np.argmax(preds, -1)
if args.eval:
correct = (preds == all_labels[dataset]).sum()
num = len(all_labels[dataset])
accuracy = correct / num
count += num
all_correct += correct
accuracy = (preds == all_labels[dataset]).mean()
print(accuracy)
if not os.path.exists(os.path.join(args.outdir, dataset)):
os.makedirs(os.path.join(args.outdir, dataset))
outpath = os.path.join(
args.outdir, dataset, os.path.splitext(
args.prediction_name)[0] + '.tsv')
with open(outpath, 'w') as f:
f.write('id\tlabel\n')
f.write('\n'.join(str(uid) + '\t' + str(args.labels[p])
for uid, p in zip(all_uid[dataset], preds.tolist())))
if args.eval:
print(all_correct / count)
def ensemble_predictions(args):
all_predictions, all_labels, all_uid = process_files(args)
all_predictions, all_labels = postprocess_predictions(all_predictions, all_labels, args)
write_predictions(all_predictions, all_labels, all_uid, args)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--paths', required=True, nargs='+',
help='paths to checkpoint directories used in ensemble')
parser.add_argument('--eval', action='store_true',
help='compute accuracy metrics against labels (dev set)')
parser.add_argument('--outdir',
help='directory to place ensembled predictions in')
parser.add_argument('--prediction-name', default='test_predictions.pt',
help='name of predictions in checkpoint directories')
parser.add_argument('--calc-threshold', action='store_true',
help='calculate threshold classification')
parser.add_argument('--one-threshold', action='store_true',
help='use one threshold for all subdatasets')
parser.add_argument('--threshold', nargs='+', default=None, type=float,
help='user supplied threshold for classification')
parser.add_argument('--labels', nargs='+', default=None,
help='whitespace separated list of label names')
args = parser.parse_args()
ensemble_predictions(args)
if __name__ == '__main__':
main()
# Downloads the specified tasks in the evaluation harness.
# This is particularly useful when running in environments where the GPU nodes
# do not have internet access. This way we can pre-download them and use the cached dataset during evaluation.
from lm_eval import tasks
from lm_eval.tasks import ALL_TASKS
import argparse
import os
parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False)
parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.')
args = parser.parse_args()
def main():
task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
tasks.get_task_dict(task_list)
if __name__ == '__main__':
main()
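# Example usage (the file name is assumed here; adjust it to wherever this script lives):
#   python download_evaluation_harness_tasks.py --task_list all
#   python download_evaluation_harness_tasks.py --task_list lambada,hellaswag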