#!/bin/bash
dir=$(pwd)
###############################################################################
### Main configs
## GPT-3 models use 2K sequence length/context window
seq_len=2048
## The "GPT-3 XXX" below are configs from GPT-3 paper
## https://arxiv.org/abs/2005.14165, choose based on
## your desired model size or build your own configs
## init_std is the standard deviation for weight initialization. Usually a
## larger model needs a lower std. We used a heuristic equation of
## sqrt(1/3/hidden_size) from the MT-NLG 530B work
## (https://arxiv.org/pdf/2201.11990.pdf).
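## For example, for the 1.3B config below (hidden_size=2048), the heuristic
## gives sqrt(1/(3*2048)) ~= 0.0128, which is rounded to init_std=0.013.
## Quick sanity check:
# awk 'BEGIN{printf "%.4f\n", sqrt(1/(3*2048))}'   # prints 0.0128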
## We changed min_lr to a lower number (1.0e-6), which we found is able to
## provide better zero-shot eval results.
## GPT-3 Small 125M
# model_size=0.125
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=256
# lr=6.0e-4
# min_lr=1.0e-6
# init_std=0.02
## GPT-3 Medium 350M
# model_size=0.35
# num_layers=24
# hidden_size=1024
# num_attn_heads=16
# global_batch_size=256
# lr=3.0e-4
# min_lr=1.0e-6
# init_std=0.018
## GPT-3 Large 760M
# model_size=0.76
# num_layers=24
# hidden_size=1536
# num_attn_heads=16
# global_batch_size=256
# lr=2.5e-4
# min_lr=1.0e-6
# init_std=0.015
## GPT-3 XL 1.3B
model_size=1.3
num_layers=24
hidden_size=2048
num_attn_heads=16
global_batch_size=512
lr=2.0e-4
min_lr=1.0e-6
init_std=0.013
## GPT-3 2.7B
# model_size=2.7
# num_layers=32
# hidden_size=2560
# num_attn_heads=32
# global_batch_size=512
# lr=1.6e-4
# min_lr=1.0e-6
# init_std=0.011
## GPT-3 6.7B
# model_size=6.7
# num_layers=32
# hidden_size=4096
# num_attn_heads=32
# global_batch_size=1024
# lr=1.2e-4
# min_lr=1.0e-6
# init_std=0.009
## GPT-3 13B
# model_size=13
# num_layers=40
# hidden_size=5120
# num_attn_heads=40
# global_batch_size=1024
# lr=1.0e-4
# min_lr=1.0e-6
# init_std=0.008
## GPT-3 175B
# model_size=175
# num_layers=96
# hidden_size=12288
# num_attn_heads=96
# global_batch_size=1536
# lr=0.6e-4
# min_lr=1.0e-6
# init_std=0.005
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
train_tokens_in_billion=300
train_tokens=$((${train_tokens_in_billion} * 1000000000))
## train_samples is another termination condition and also affects the number
## of data samples to be indexed. Since we want to reach the train_tokens
## above, and data efficiency techniques may change the number of tokens in
## some samples, we set this config large enough to ensure we have enough
## processed data and don't terminate by train_samples.
train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
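## e.g., with seq_len=2048 this gives 300e9*2/2048 ~= 293M samples, about twice
## the ~146M samples (300e9/2048) that 300B tokens actually require.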
## Another wall-clock time termination condition in minutes. Set it large
## enough to avoid undesired early termination.
exit_duration=30000000
###############################################################################
### lr configs
## lr warmup and decay duration.
## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
## Here we increase the warmup tokens to 3B since, when batch size warmup is
## not used, there are more tokens per step. Thus we need to increase the
## warmup tokens to make sure there are enough warmup steps, which is
## important for training stability.
lr_warmup_tokens_in_million=3000
lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
## Here we changed the LR decay tokens to align with total train tokens, since
## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
## learning rate schedule to match the number of training tokens results in the
## best final model quality
lr_decay_tokens_in_billion=${train_tokens_in_billion}
lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
lr_decay_style="cosine"
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=4
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Note that currently both curriculum learning and random-LTD are NOT
## compatible with pipeline parallelism.
pp_size=8
no_pp="false"
## ZeRO-based data parallelism, stage=0 will disable ZeRO
zero_stage=1
## Total number of GPUs. ds_ssh is from the DeepSpeed library.
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
## Data parallel size.
dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
## Micro batch size per GPU
## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus.
## Reduce it manually if you hit GPU OOM.
# batch_size=$(( ${global_batch_size} / ${dp_size} ))
batch_size=2
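## Worked example (assuming 64 GPUs): dp_size = 64/8/4 = 2, so with
## batch_size=2 each iteration accumulates
## global_batch_size/(dp_size*batch_size) = 512/(2*2) = 128 micro-steps.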
###############################################################################
### curriculum learning (sequence length warmup) configs
# The "divided by 3" means we use 1/3 of baseline's total steps for sequence length warmup.
# This is not always the best config, but usually a reasonable choice to start with.
cl_step=$(( ${lr_warmup_tokens} / 3 / ${global_batch_size} / ${seq_len} ))
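# With the 1.3B config above: (3e9/3) tokens / (512*2048) tokens per step ~= 953 steps.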
# Starting sequence length for sequence length warmup. If the train/validation
# loss is unstable at the beginning of training, increase this value, but keep
# it a multiple of 8 to enable Tensor Core acceleration.
cl_min=64
###############################################################################
### Misc configs
log_interval=10
eval_iters=10
eval_interval=100
# num_save controls how frequently checkpoints are saved. num_save=20 means a
# checkpoint is saved every 5% of training. For longer training you would want
# a larger num_save to save more frequently, and vice versa.
num_save=100
estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
# save_interval=$((${estimated_train_iter} / ${num_save}))
save_interval=100
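## With the 1.3B config, estimated_train_iter ~= 286K iterations, so the
## commented formula above would give save_interval ~= 2861; here we save
## every 100 iterations instead.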
## Activation checkpointing saves GPU memory, but reduces training speed
activation_checkpoint="true"
# activation_checkpoint="false"
## Whether or not to log optimizer states (norms, max abs values) to
## tensorboard. This is not required for training and might save GPU memory
## when turned off.
log_optimizer_state="true"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d_%H.%M.%S")
host="${HOSTNAME}"
seed=1234
num_workers=0
## The public Pile dataset can be downloaded at
## https://mystic.the-eye.eu/public/AI/pile_neox/ or
## https://the-eye.eu/public/AI/pile_neox/. Change data_home to where you
## store the pile_text_document.bin and pile_text_document.idx files.
data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
data_path="${data_home}/pile_text_document"
vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi
prescale_grad="true"
jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
if [[ $zero_stage -gt 0 ]]; then
jobname="${jobname}_z${zero_stage}"
prescale_grad="false"
fi
if [[ $mp_size -gt 1 ]]; then
jobname="${jobname}_mp${mp_size}"
fi
if [ "${no_pp}" = "false" ]; then
jobname="${jobname}_pp${pp_size}"
fi
jobname="${jobname}_seed${seed}_rebase_rope0.25"
jobname="${jobname}_cl_step${cl_step}_cl_min${cl_min}"
username=$(whoami)
output_home="/blob/users/${username}/project/data_efficient_gpt"
log_path="${output_home}/log/"
checkpoint_path="${output_home}/checkpoint/${jobname}"
## Microsoft internal constraint: because tensorboard is logged by the last
## rank, it's better to put the path on NFS instead of Blob.
tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
mkdir -p ${log_path}
mkdir -p ${checkpoint_path}
mkdir -p ${tensorboard_path}
###############################################################################
data_options=" \
--vocab-file ${vocab_path} \
--merge-file ${merge_path} \
--data-path ${data_path} \
--data-impl mmap"
## If CL is used, make sure to set "--split" to the same value you used during
## offline data analysis & indexing.
megatron_options=" \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size ${mp_size} \
--init-method-std ${init_std} \
--lr-decay-tokens ${lr_decay_tokens} \
--lr-warmup-tokens ${lr_warmup_tokens} \
--micro-batch-size ${batch_size} \
--exit-duration-in-mins ${exit_duration} \
--global-batch-size ${global_batch_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--train-tokens ${train_tokens} \
--train-samples ${train_samples} \
--lr ${lr} \
--min-lr ${min_lr} \
--lr-decay-style ${lr_decay_style} \
--split 949,50,1 \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--save-interval ${save_interval} \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers ${num_workers} \
--fp16 \
--seed ${seed} \
--load ${checkpoint_path} \
--save ${checkpoint_path} \
--no-async-tensor-model-parallel-allreduce \
--use-rotary-position-embeddings \
--rotary-percent 0.25 \
--tensorboard-queue-size 1 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensorboard-dir ${tensorboard_path}"
if [ "${activation_checkpoint}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
fi
if [ "${log_optimizer_state}" = "true" ]; then
megatron_options="${megatron_options} \
--log-optimizer-states-to-tensorboard"
fi
config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_cl_step${cl_step}_cl_min${cl_min}.json"
template_json="ds_config_gpt_slw_TEMPLATE.json"
sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
| sed "s/MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/${prescale_grad}/" \
| sed "s/CONFIG_CL_MIN/${cl_min}/" \
| sed "s/CONFIG_CL_MAX/${seq_len}/" \
| sed "s/CONFIG_CL_DURATION/${cl_step}/" \
> ${config_json}
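## For reference, with the configs above the curriculum_learning block of the
## generated JSON is filled in as: "min_difficulty": 64, "max_difficulty": 2048,
## "total_curriculum_step": ${cl_step}, "difficulty_step": 8.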
deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--pipeline-model-parallel-size ${pp_size}"
if [[ "${no_pp}" = "true" ]]; then
deepspeed_options="${deepspeed_options} \
--no-pipeline-parallel"
fi
if [ "${activation_checkpoint}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--deepspeed-activation-checkpointing"
fi
## When saving checkpoints to storage with a cache, there can be a consistency
## issue with the pointer to the latest checkpoint. Here we find the correct
## pointer and broadcast it to all nodes.
iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
iteration_file_2="$checkpoint_path/latest"
iteration=0
for (( node = 0; node <= num_node-1; node++ ))
do
    if ssh -q worker-"$node" "test -f \"$iteration_file\""; then
        local_iteration=$(ssh -q worker-"$node" cat "$iteration_file")
iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} : ${iteration} ))
fi
done
if [[ $iteration -gt 0 ]]; then
iteration_2="global_step${iteration}"
ds_ssh "echo $iteration > $iteration_file"
ds_ssh "echo $iteration_2 > $iteration_file_2"
fi
deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
# baseline
# CONFIG=baseline
# TAG=baseline
# MODEL_SIZE=1558
# LR=1.5e-4
# BSZ=512
# SEQ_LEN=1024
# MP_SIZE=1
# SEED=1234
# SAVE_INTERVAL=5000
# NUM_ITER=600000
# NUM_TOKEN=157286400000
# LR_DECAY_TOKEN=157286400000
# LR_WARMUP_ITER=3000
# CONFIG_TEMPLATE=false
# CURRICULUM_STEP=0
# CURRICULUM_MIN=0
# curriculum learning
CONFIG=curriculum_fixed_linear
MODEL_SIZE=1558
LR=6e-4
BSZ=4096
SEQ_LEN=1024
MP_SIZE=1
SEED=1234
SAVE_INTERVAL=1000
NUM_ITER=75000
NUM_TOKEN=157286400000
LR_DECAY_TOKEN=157286400000
LR_WARMUP_ITER=3000
CONFIG_TEMPLATE=true
CURRICULUM_STEP=45000
CURRICULUM_MIN=64
TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}"
bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN
{
"train_batch_size": 512,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0,
"betas": [0.9, 0.95]
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"wall_clock_breakdown": false,
"zero_allow_untested_optimizer": false
}
{
"train_batch_size": 512,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0,
"betas": [0.9, 0.95]
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"wall_clock_breakdown": false,
"zero_allow_untested_optimizer": false,
"curriculum_learning": {
"enabled": true,
"curriculum_type": "seqlen",
"min_difficulty": CONFIG_CL_MIN,
"max_difficulty": CONFIG_CL_MAX,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": CONFIG_CL_DURATION,
"difficulty_step": 8
}
}
}
This directory includes GPT-3/BERT pretraining example scripts for the DeepSpeed Data Efficiency Library techniques (curriculum learning, random-LTD, and the two composed together).
You need to install a DeepSpeed version that contains the DeepSpeed Data Efficiency Library (>=0.8.0).
An additional tutorial can be found on the [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/).
Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597).
## GPT-3 pretraining and evaluation
Inside the ``gpt`` folder, the ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` scripts are used first for curriculum learning's offline data analysis and indexing.
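As a minimal sketch (assuming the GPT analysis scripts take the worker id as their first argument, like the BERT analysis scripts in this repo), a 2-node analysis job could look like:

```bash
# Map stage: run once per analysis node, with worker ids 0..num_workers-1.
bash ds_analyze_gpt_data_map.sh 0   # on analysis node 0
bash ds_analyze_gpt_data_map.sh 1   # on analysis node 1
# Reduce stage: run once on a single node after all map jobs finish.
bash ds_analyze_gpt_data_reduce.sh
```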
``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality.
``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot.
## BERT pretraining and finetuning
Inside the ``bert`` folder, the ``pile_data_download_preprocess.py`` script can be used first to download and preprocess the public Pile dataset.
The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` scripts are used for curriculum learning's offline data analysis and indexing.
``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality.
``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable, and the Megatron-LM paper used a very large number of epochs for MNLI/QQP, which is not necessary. Thus we added the capability of finetuning other GLUE tasks and switched to the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend using instead of ``bert/finetune``. Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning.
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
import os
import time
import sys
import math
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir,os.path.pardir)))
from datetime import datetime
import numpy as np
import torch
from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer \
import DataAnalyzer
from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset \
import MMapIndexedDataset
from megatron import get_args
from megatron import print_rank_0
from megatron.initialize import initialize_megatron
def get_tasks_args(parser):
"""Provide extra arguments required for data analyzing."""
group = parser.add_argument_group(title='data_analyzing')
group.add_argument('--analyzing-task', type=str, required=True,
default=None,
choices=['map',
'reduce'],
help='What type of analyzing task to perform.')
group.add_argument('--analyzing-data-type', type=str, required=True,
default=None,
choices=['BERT',
'GPT'],
help='What type of data.')
group.add_argument('--analyzing-metric', type=str, nargs='+', default=[],
help='What kinds of metrics to analyze.')
group.add_argument('--analyzing-num-workers', type=int, default=1,
help='Number of workers. Each worker could be a single CPU node.')
group.add_argument('--analyzing-worker-id', type=int, default=0,
help='Worker id of current node.')
group.add_argument('--analyzing-num-threads', type=int, default=1,
help='Number of threads for each worker.')
    group.add_argument('--analyzing-num-threads-reduce', type=int, default=1,
                       help='Number of threads for the reduce job.')
group.add_argument('--analyzing-specific-threads', type=int, nargs='+', default=[],
                       help='Which specific threads to run. Helpful when specific threads failed in a previous run.')
return parser
def train_valid_test_datasets_provider_gpt():
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
from megatron.data.gpt_dataset import build_train_valid_test_datasets
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
def train_valid_test_datasets_provider_bert():
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for BERT ...')
from megatron.data.dataset_utils import build_train_valid_test_datasets
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them
max_seq_length=args.seq_length,
masked_lm_prob=args.mask_prob,
short_seq_prob=args.short_seq_prob,
seed=args.seed,
skip_warmup=(not args.mmap_warmup),
binary_head=args.bert_binary_head)
print_rank_0("> finished creating BERT datasets ...")
return train_ds, valid_ds, test_ds
def metric_seqlen(data):
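    # Sequence length metric: number of non-padding tokens in each sample.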
metric = torch.count_nonzero(data['padding_mask'], dim=1)
return metric
def metric_total_vocab_freq(data):
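    # Dataset-wide token-id histogram; for BERT, padding positions are
    # zero-weighted via the padding_mask.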
args = get_args()
if args.analyzing_data_type == 'BERT':
frequency = torch.bincount(data['text'].view(-1),
minlength=args.padded_vocab_size+1,
weights=data['padding_mask'].view(-1))
elif args.analyzing_data_type == 'GPT':
frequency = torch.bincount(data['text'].view(-1),
minlength=args.padded_vocab_size+1)
return frequency
def metric_vocab_rarity(data):
args = get_args()
if args.analyzing_data_type == 'BERT':
rarity = torch.sum(data['padding_mask'] * \
args.total_vocab_freq[data['text']], dim=1).to(torch.long)
elif args.analyzing_data_type == 'GPT':
rarity = []
        # Process rows one by one to avoid excessive memory consumption
for row in range(data['text'].size()[0]):
rarity.append(int(torch.sum(args.total_vocab_freq[data['text'][row]]).item()))
rarity = torch.tensor(rarity, dtype=torch.long)
print(f"rarity min {min(rarity)}, max {max(rarity)}, len {len(rarity)}, avg {sum(rarity)/len(rarity)}")
return rarity
def metric_seqlen_vocab_rarity(data):
args = get_args()
metric = torch.count_nonzero(data['padding_mask'], dim=1).to(torch.long) * args.seqlen_coeff
metric += torch.sum(data['padding_mask'] * \
args.total_vocab_freq[data['text']], dim=1).to(torch.long)
print(f"metric min {min(metric)}, max {max(metric)}, len {len(metric)}, avg {sum(metric)/len(metric)}")
return metric
def get_metric_function(metric_name):
if metric_name == 'seqlen':
return metric_seqlen
if metric_name == 'total_vocab_freq':
return metric_total_vocab_freq
if metric_name == 'vocab_rarity':
return metric_vocab_rarity
if metric_name == 'seqlen_vocab_rarity':
return metric_seqlen_vocab_rarity
def get_metric_type(metric_name):
if metric_name == 'seqlen':
return 'single_value_per_sample'
if metric_name == 'total_vocab_freq':
return 'accumulate_value_over_samples'
if metric_name == 'vocab_rarity':
return 'single_value_per_sample'
if metric_name == 'seqlen_vocab_rarity':
return 'single_value_per_sample'
def run_map():
args = get_args()
if args.analyzing_data_type == 'BERT':
args.mask_prob = 0 # When analyzing data, we don't want any mask.
train_ds, _, _ = train_valid_test_datasets_provider_bert()
elif args.analyzing_data_type == 'GPT':
train_ds, _, _ = train_valid_test_datasets_provider_gpt()
        assert 'seqlen' not in args.analyzing_metric, 'GPT data has a fixed seqlen, so there is no need to analyze the seqlen metric.'
        assert 'seqlen_vocab_rarity' not in args.analyzing_metric, 'GPT data has a fixed seqlen, so there is no need to analyze the seqlen_vocab_rarity metric.'
if 'vocab_rarity' in args.analyzing_metric or 'seqlen_vocab_rarity' in args.analyzing_metric:
total_vocab_freq_fname = f"{args.save}/total_vocab_freq/total_vocab_freq_metric_value"
assert os.path.isfile(f"{total_vocab_freq_fname}.bin") and os.path.isfile(f"{total_vocab_freq_fname}.idx"), "To analyze vocab rarity, first need to analyze the total vocab freq."
total_vocab_freq = MMapIndexedDataset(total_vocab_freq_fname, skip_warmup=True)
total_vocab_freq = np.copy(total_vocab_freq[0])
total_vocab_freq[total_vocab_freq == 0] = 1 # Avoid log(0) error
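        # Convert counts to -log(relative frequency): rarer tokens get larger values.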
total_vocab_freq = np.log(total_vocab_freq/sum(total_vocab_freq)) * -1
args.total_vocab_freq = torch.tensor(total_vocab_freq, dtype=torch.double)
if 'seqlen_vocab_rarity' in args.analyzing_metric:
            # Use a large coefficient so that seqlen dominates vocab_rarity
max_possible_rarity = args.seq_length * torch.max(args.total_vocab_freq).item()
args.seqlen_coeff = 10 ** (math.ceil(math.log(max_possible_rarity, 10)) + 1)
print(f"Metric seqlen_vocab_rarity: using {args.seqlen_coeff} as coefficient for seqlen.")
metric_functions = [get_metric_function(x) for x in args.analyzing_metric]
metric_types = [get_metric_type(x) for x in args.analyzing_metric]
    # For metric_dtypes we use int64 by default, since it could be hard to
    # estimate the appropriate dtype before the mapping analysis. During the
    # reduce step, where we merge the analysis results, the DataAnalyzer will
    # automatically choose the dtype of the merged result file as the smallest
    # one that meets the range requirement.
metric_dtypes = [np.int64 for x in args.analyzing_metric]
start = time.time()
data_analyzer = DataAnalyzer(train_ds,
num_workers=args.analyzing_num_workers,
worker_id=args.analyzing_worker_id,
num_threads=args.analyzing_num_threads,
specific_threads=args.analyzing_specific_threads,
batch_size=args.global_batch_size, metric_names=args.analyzing_metric,
metric_functions=metric_functions, metric_types=metric_types,
metric_dtypes=metric_dtypes, save_path=args.save)
data_analyzer.run_map()
duration = (time.time() - start) / 3600.0
print(f"map job finished in {duration} hr.")
def run_reduce():
args = get_args()
if args.analyzing_data_type == 'BERT':
args.mask_prob = 0 # When analyzing data, we don't want any mask.
train_ds, _, _ = train_valid_test_datasets_provider_bert()
elif args.analyzing_data_type == 'GPT':
train_ds, _, _ = train_valid_test_datasets_provider_gpt()
metric_functions = [get_metric_function(x) for x in args.analyzing_metric]
metric_types = [get_metric_type(x) for x in args.analyzing_metric]
metric_dtypes = [np.int64 for x in args.analyzing_metric]
start = time.time()
data_analyzer = DataAnalyzer(train_ds,
num_workers=args.analyzing_num_workers,
num_threads=args.analyzing_num_threads,
num_threads_reduce=args.analyzing_num_threads_reduce,
batch_size=args.global_batch_size, metric_names=args.analyzing_metric,
metric_functions=metric_functions, metric_types=metric_types,
metric_dtypes=metric_dtypes, save_path=args.save)
data_analyzer.run_reduce()
duration = (time.time() - start) / 3600.0
print(f"reduce job finished in {duration} hr.")
if __name__ == "__main__":
initialize_megatron(extra_args_provider=get_tasks_args, allow_no_cuda=True)
args = get_args()
if args.analyzing_task == 'map':
run_map()
elif args.analyzing_task == 'reduce':
run_reduce()
else:
raise NotImplementedError('Task {} is not implemented.'.format(
args.analyzing_task))
#!/bin/bash
num_workers=1 # Number of nodes to run the map job
num_threads=40 # Number of threads on each node; set based on the number of CPU cores
# If different data epochs have slightly different data samples (e.g., due
# to randomness), then you need to specify a num_epochs large enough to cover
# the whole pretraining. If different data epochs are identical, set num_epochs
# to 1 to index only 1 epoch; during pretraining the DeepSpeed data efficiency
# library will automatically handle reshuffling when reaching another epoch.
num_epochs=5
# Which node this is (starting at 0 and ending at num_workers-1). This
# script only launches the map job on 1 worker node, since we don't expect
# to run on many nodes and the workers don't need any communication. But you
# can modify this script to add an MPI/torch distributed launcher.
worker_id=$1
save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/"
metric='total_vocab_freq'
# metric='vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen'
seq_len=512
batch_size=10000
jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}"
## The public Pile dataset: see prepare_pile_data.py in the same directory
## for how to download and preprocess the data.
## Change data_home to your own training data path.
# data_home="/vc_data_blob/users/conglli/the_pile_bert"
data_home="/blob/data/the_pile_bert"
data_path="${data_home}/pile_bert_train_text_sentence"
vocab_path="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
# Make sure the "--split" is the same as what you will use for pre-training.
options=" \
--analyzing-task map \
--analyzing-data-type BERT \
--analyzing-metric ${metric} \
--analyzing-num-workers ${num_workers} \
--analyzing-worker-id ${worker_id} \
--analyzing-num-threads ${num_threads} \
--vocab-file ${vocab_path} \
--data-path ${data_path} \
--data-impl mmap \
--tokenizer-type BertWordPieceLowerCase \
--micro-batch-size ${batch_size} \
--global-batch-size ${batch_size} \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--num-layers 1 \
--hidden-size 1 \
--num-attention-heads 1 \
--split 949,50,1 \
--distributed-backend gloo \
--train-data-exact-num-epochs ${num_epochs} \
--return-data-index \
--save-interval 1 \
--save ${save_path}"
python ../analyze_data.py ${options} &> ${jobname}.log
#!/bin/bash
# Set these 2 to the same values you used during the map job. We need these 2
# configs to know how many map job result files there are.
num_workers=1
num_threads=40
# The reduce job only has 1 worker but can be accelerated by multithreading.
num_threads_reduce=40
# If different data epochs have slightly different data samples (e.g., due
# to randomness), then you need to specify a num_epochs large enough to cover
# the whole pretraining. If different data epochs are identical, set num_epochs
# to 1 to index only 1 epoch; during pretraining the DeepSpeed data efficiency
# library will automatically handle reshuffling when reaching another epoch.
num_epochs=5
save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/"
metric='total_vocab_freq'
# metric='vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen'
seq_len=512
batch_size=10000
jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce"
## The public Pile dataset: see prepare_pile_data.py in the same directory
## for how to download and preprocess the data.
## Change data_home to your own training data path.
# data_home="/vc_data_blob/users/conglli/the_pile_bert"
data_home="/blob/data/the_pile_bert"
data_path="${data_home}/pile_bert_train_text_sentence"
vocab_path="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
# Make sure the "--split" is the same as what you will use for pre-training.
options=" \
--analyzing-task reduce \
--analyzing-data-type BERT \
--analyzing-metric ${metric} \
--analyzing-num-workers ${num_workers} \
--analyzing-num-threads ${num_threads} \
--analyzing-num-threads-reduce ${num_threads_reduce} \
--vocab-file ${vocab_path} \
--data-path ${data_path} \
--data-impl mmap \
--tokenizer-type BertWordPieceLowerCase \
--micro-batch-size ${batch_size} \
--global-batch-size ${batch_size} \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--num-layers 1 \
--hidden-size 1 \
--num-attention-heads 1 \
--split 949,50,1 \
--distributed-backend gloo \
--train-data-exact-num-epochs ${num_epochs} \
--return-data-index \
--save-interval 1 \
--save ${save_path}"
python ../analyze_data.py ${options} &> ${jobname}.log
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false
}
seed=1234
pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
### The main configs are from Megatron-LM paper
### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
### or build your own configs.
seq_len=512
## From Table 6 in https://arxiv.org/abs/1909.08053.
task="MNLI"
global_batch_size=128
lr=1e-5
epochs=10
train_data="/blob/data/GlueData/MNLI/train.tsv"
valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
/blob/data/GlueData/MNLI/dev_mismatched.tsv"
## Adjust based on number of GPUs.
batch_size=16
## BERT 110M (same config as original BERT-Base model)
## This config is not included in Megatron-LM paper
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
## BERT 336M (same config as original BERT-Large model)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
save_interval=500000
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
mkdir -p ${checkpoint_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.065 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--save-interval ${save_interval} \
--save ${checkpoint_path} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev set accuracy numbers by
# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
seed=1234
pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
### The main configs are from Megatron-LM paper
### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
### or build your own configs.
seq_len=512
## From Table 6 in https://arxiv.org/abs/1909.08053.
task="QQP"
train_data="/blob/data/GlueData/QQP/train.tsv"
valid_data="/blob/data/GlueData/QQP/dev.tsv"
## Adjust based on number of GPUs.
batch_size=16
## BERT 110M (same config as original BERT-Base model)
## This config is not included in Megatron-LM paper
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=128
# lr=5e-5
# epochs=12
## BERT 336M (same config as original BERT-Large model)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
global_batch_size=128
lr=5e-5
epochs=12
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
# global_batch_size=128
# lr=3e-5
# epochs=12
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
# global_batch_size=256
# lr=4e-5
# epochs=12
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
save_interval=500000
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
mkdir -p ${checkpoint_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.065 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--save-interval ${save_interval} \
--save ${checkpoint_path} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev set accuracy numbers by
# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
seed=1234
## RACE has two sub-tasks that need to be finetuned separately
difficulty="middle"
# difficulty="high"
pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
### The main configs are from Megatron-LM paper
### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
### or build your own configs.
seq_len=512
## From Table 6 in https://arxiv.org/abs/1909.08053.
task="RACE"
## The RACE dataset can be downloaded via:
## wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz
train_data="/blob/data/RACE/train/${difficulty}"
## The Megatron paper https://arxiv.org/abs/1909.08053 says: "For the test set
## results of RACE, we first use the development set to find the checkpoint
## that gives us the median score on the 5 random seeds and we report the
## results from that checkpoint on the test set", which is a rather confusing
## description. For simplicity, we instead directly take the median dev and
## test set scores over 5 random seeds from a single pretrained_checkpoint.
valid_data="/blob/data/RACE/dev/${difficulty} \
/blob/data/RACE/test/${difficulty}"
## Adjust based on number of GPUs.
batch_size=4
## BERT 110M (same config as original BERT-Base model)
## This config is not included in Megatron-LM paper
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=32
# lr=2e-5
# epochs=3
## BERT 336M (same config as original BERT-Large model)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
global_batch_size=32
lr=2e-5
epochs=3
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
# global_batch_size=16
# lr=1e-5
# epochs=3
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
# global_batch_size=32
# lr=2e-5
# epochs=3
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
save_interval=100000
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-${difficulty}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
mkdir -p ${checkpoint_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.06 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--save-interval ${save_interval} \
--save ${checkpoint_path} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--clip-grad 1.0 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev/test set accuracy numbers
# by "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
import os
import statistics
def gather_numbers(fname, match_keywords, index_keywords, index_offsets):
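    # Scan the log file line by line; whenever match_keywords[i] appears in a
    # line, locate index_keywords[i] among the space-separated tokens and
    # record the token index_offsets[i] positions after it as a float.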
results = {}
for k in index_keywords:
results[k] = []
file1 = open(fname, 'r')
while True:
line = file1.readline()
if not line:
break
splits = line.split(' ')
for i in range(len(match_keywords)):
if match_keywords[i] in line:
ref_idx = splits.index(index_keywords[i])
results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]]))
file1.close()
return results
def gather_MNLI_results(result_path):
overall = []
matched = []
mismatched = []
for file in os.listdir(result_path):
if file.startswith('MNLI'):
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
results = gather_numbers(fname,
['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'],
['overall:', 'dev-matched:', 'dev-mismatched:'],
[9, 9, 9])
overall_candidate = results['overall:']
matched_candidate = results['dev-matched:']
mismatched_candidate = results['dev-mismatched:']
if len(overall_candidate) > 0:
assert len(overall_candidate) == len(matched_candidate) and len(overall_candidate) == len(mismatched_candidate)
best_index = overall_candidate.index(max(overall_candidate))
overall.append(overall_candidate[best_index])
matched.append(matched_candidate[best_index])
mismatched.append(mismatched_candidate[best_index])
if len(overall) > 0:
if len(overall) % 2 == 1:
median_idx = overall.index(statistics.median(overall))
else:
median_idx = overall.index(statistics.median_high(overall))
print(f'MNLI how Megatron paper reported: overall results median {statistics.median(overall)}, corresponding matched/mismatched: {matched[median_idx]}/{mismatched[median_idx]}')
print(f'MNLI other results:')
print(f'MNLI overall results {overall}, median {statistics.median(overall)} (corresponding matched/mismatched {matched[median_idx]}/{mismatched[median_idx]}), mean {statistics.mean(overall)}, std {statistics.stdev(overall)}')
print(f'MNLI matched results {matched}, median {statistics.median(matched)}, mean {statistics.mean(matched)}, std {statistics.stdev(matched)}')
print(f'MNLI mismatched results {mismatched}, median {statistics.median(mismatched)}, mean {statistics.mean(mismatched)}, std {statistics.stdev(mismatched)}')
else:
print("Didn't find any MNLI result")
def gather_QQP_results(result_path):
overall = []
for file in os.listdir(result_path):
if file.startswith('QQP'):
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
results = gather_numbers(fname, ['overall:'], ['overall:'], [9])
overall_candidate = results['overall:']
if len(overall_candidate) > 0:
best_index = overall_candidate.index(max(overall_candidate))
overall.append(overall_candidate[best_index])
if len(overall) > 0:
print(f'QQP how Megatron paper reported: overall results median {statistics.median(overall)}')
print(f'QQP other results:')
print(f'QQP overall results {overall}, median {statistics.median(overall)}, mean {statistics.mean(overall)}, std {statistics.stdev(overall)}')
else:
print("Didn't find any QQP result")
def gather_RACE_results(result_path, task):
dev = []
test = []
for file in os.listdir(result_path):
if file.startswith(f'RACE-{task}'):
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
results = gather_numbers(fname,
[f'metrics for dev-{task}:', f'metrics for test-{task}:'],
[f'dev-{task}:', f'test-{task}:'],
[9, 9])
dev_candidate = results[f'dev-{task}:']
test_candidate = results[f'test-{task}:']
if len(dev_candidate) > 0:
assert len(dev_candidate) == len(test_candidate)
dev.append(max(dev_candidate))
test.append(max(test_candidate))
if len(dev) > 0:
if len(dev) % 2 == 1:
median_idx = dev.index(statistics.median(dev))
else:
median_idx = dev.index(statistics.median_high(dev))
print(f'RACE-{task} how Megatron paper reported: test result from the median of dev results {test[median_idx]}')
print(f'RACE-{task} other results:')
print(f'RACE-{task} dev results {dev}, median {statistics.median(dev)}, mean {statistics.mean(dev)}, std {statistics.stdev(dev)}')
print(f'RACE-{task} test results {test}, median {statistics.median(test)}, mean {statistics.mean(test)}, std {statistics.stdev(test)}')
else:
print(f"Didn't find any RACE-{task} result")
def gather_finetune_results(result_path):
print(f'Gather finetune results for {result_path}')
gather_MNLI_results(result_path)
gather_QQP_results(result_path)
gather_RACE_results(result_path, 'middle')
gather_RACE_results(result_path, 'high')
if __name__ == '__main__':
result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/'
gather_finetune_results(result_path)
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false
}
hostname_and_rank=$1
master_port=$2
seed=$3
task=$4
lr=$5
pretrained_checkpoint=$6
# hostname_and_rank="worker-0:0,1,2,3"
# master_port=12345
# seed=1234
# task="MNLI"
# lr=2e-5
# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
seq_len=512
global_batch_size=32
epochs=3
train_data="/blob/data/GlueData/${task}/train.tsv"
valid_data="/blob/data/GlueData/${task}/dev.tsv"
if [[ "${task}" = "MNLI" ]]; then
valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
/blob/data/GlueData/MNLI/dev_mismatched.tsv"
fi
## Adjust based on number of GPUs.
batch_size=8
## BERT 110M (BERT-Base)
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
## BERT 336M (BERT-Large)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
# output_path="${pretrained_checkpoint}-finetune-glue-4v100/${jobname}"
output_path=$(basename "$pretrained_checkpoint")
output_path="glue-results/${output_path}-finetune-glue-4v100/${jobname}"
mkdir -p ${output_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.1 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev set accuracy numbers by
# "grep -e "overall:" -e "metrics for" ${output_path}/output.log"
deepspeed --include=${hostname_and_rank} --master_port=${master_port} ../../../../tasks/main.py ${options} &> ${output_path}/output.log
hostname_and_rank=$1
master_port=$2
pretrained_checkpoint=$3
# hostname_and_rank="worker-0:0,1,2,3"
# master_port=12345
# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
tasks=(
RTE
MRPC
STS-B
CoLA
SST-2
QNLI
QQP
MNLI
)
seeds=(
1234
1235
1236
1237
1238
)
lrs=(
2e-5
3e-5
4e-5
5e-5
)
for ((i=0;i<${#tasks[@]};++i)); do
task=${tasks[i]}
for ((j=0;j<${#seeds[@]};++j)); do
seed=${seeds[j]}
for ((k=0;k<${#lrs[@]};++k)); do
lr=${lrs[k]}
bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint}
done
done
done
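# Note: this sweep launches 8 tasks x 5 seeds x 4 learning rates = 160
# sequential fine-tuning runs, so expect it to take a long time on a single
# node.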
import os
import statistics
def gather_numbers(fname, match_keywords, index_keywords, index_offsets):
results = {}
for k in index_keywords:
results[k] = []
    with open(fname, 'r') as f:
        for line in f:
            splits = line.split(' ')
            for i in range(len(match_keywords)):
                if match_keywords[i] in line:
                    ref_idx = splits.index(index_keywords[i])
                    results[index_keywords[i]].append(
                        float(splits[ref_idx + index_offsets[i]]))
return results
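# Example of how the offsets are used, on a hypothetical log line
# "metrics for dev: spearmanr = 0.85": splitting on spaces,
# splits.index('spearmanr') finds the keyword's position, and an index_offset
# of 2 skips the '=' token to pick up the value '0.85'.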
def gather_GLUE_results(result_path, key, lr):
result = []
mnli_matched_result = []
mnli_mismatched_result = []
for file in os.listdir(result_path):
if file.startswith(key) and lr in file:
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
if key == "STS-B":
results = gather_numbers(fname, ['metrics for'], ['spearmanr'], [2])
overall_candidate = results['spearmanr']
overall_candidate = [x * 100.0 for x in overall_candidate]
elif key == "CoLA":
results = gather_numbers(fname, ['metrics for'], ['mcc'], [2])
overall_candidate = results['mcc']
overall_candidate = [x * 100.0 for x in overall_candidate]
elif key == "MNLI":
results = gather_numbers(fname,
['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'],
['overall:', 'dev-matched:', 'dev-mismatched:'],
[9, 9, 9])
overall_candidate = results['overall:']
matched_candidate = results['dev-matched:']
mismatched_candidate = results['dev-mismatched:']
else:
results = gather_numbers(fname, ['overall:'], ['overall:'], [9])
overall_candidate = results['overall:']
if len(overall_candidate) > 0:
                    if len(overall_candidate) != 3:
                        print(f"{result_path} task {key} lr {lr} only has {len(overall_candidate)} epochs (expected 3)")
best_index = overall_candidate.index(max(overall_candidate))
result.append(overall_candidate[best_index])
if key == "MNLI":
mnli_matched_result.append(matched_candidate[best_index])
mnli_mismatched_result.append(mismatched_candidate[best_index])
if len(result) > 0:
        if len(result) != 5:
            print(f"{result_path} task {key} lr {lr} only has {len(result)} seeds (expected 5)")
if key == "MNLI":
best_index = result.index(statistics.median_high(result))
return round(mnli_matched_result[best_index],2), round(statistics.stdev(mnli_matched_result),2), round(mnli_mismatched_result[best_index],2), round(statistics.stdev(mnli_mismatched_result),2)
else:
return round(statistics.median_high(result),2), round(statistics.stdev(result),2)
else:
if key == "MNLI":
return None, None, None, None
else:
return None, None
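# Selection logic above, on a hypothetical set of 5 per-seed results
# [88.1, 88.4, 88.9, 89.0, 89.2]: statistics.median_high picks 88.9 (the
# middle value for an odd count, the higher of the two middle values for an
# even count), and the reported spread is the sample stdev across seeds.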
def gather_finetune_results(result_path, extra_col=[], lr="2e-5"):
output = ""
for field in extra_col:
output += f"{field} &"
task_output = ""
median_list, std_list = [], []
m_median, m_std, mm_median, mm_std = gather_GLUE_results(result_path, "MNLI", lr)
if m_median is not None:
median_list += [m_median, mm_median]
std_list += [m_std, mm_std]
task_output += f"{m_median}±{m_std} & {mm_median}±{mm_std} &"
tasks = ["QQP", "QNLI", "SST-2", "CoLA", "STS-B", "MRPC", "RTE"]
for task in tasks:
t_median, t_std = gather_GLUE_results(result_path, task, lr)
if t_median is not None:
median_list += [t_median]
std_list += [t_std]
if task == "RTE":
task_output += f"{t_median}±{t_std} "
else:
task_output += f"{t_median}±{t_std} &"
overall_median = round(sum(median_list) / len(median_list), 2)
overall_std = round(sum(std_list) / len(std_list), 2)
output += f"{overall_median}±{overall_std} &"
output += task_output
output += " \\\\"
print(output)
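# The printed string is one LaTeX table row, e.g. with hypothetical numbers:
# "85.1±0.3 & 84.9±0.2 & 85.3±0.4 & ... \\". After any extra_col fields, the
# overall average comes first, then MNLI-m/MNLI-mm, then the remaining tasks
# in the order listed above.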
if __name__ == '__main__':
print("\\begin{table}")
print("\centering")
print("\\tiny")
text = "\\begin{tabular}{@{}l|"
for _ in range(11):
text += "c"
text += "@{}}"
print(text)
print("\\toprule")
print("Case & Train tokens & Average & MNLI-m & MNLI-mm & QQP & QNLI & SST-2 & CoLA & STS-B & MRPC & RTE \\\\")
print("\midrule")
result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/'
gather_finetune_results(result_path)
print("\\bottomrule")
print("\end{tabular}")
print("\end{table}")
print("")
print("")
import zstandard
import sys
import time
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir,os.path.pardir,os.path.pardir)))
from megatron.data import indexed_dataset
def pile_download(download_url, file_path, i):
start = time.time()
zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
download_path = f"{download_url}{i:02}.jsonl.zst"
if not os.path.exists(zstd_file_path):
os.system(f"wget -P {file_path} {download_path}")
print(f"Finished downloading chunk {i} in {time.time() - start} sec")
def pile_decompress(download_url, file_path, i):
zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
output_path = f"{file_path}{i:02}.jsonl"
if not os.path.exists(output_path):
if not os.path.exists(zstd_file_path):
pile_download(download_url, file_path, i)
start = time.time()
with open(zstd_file_path, 'rb') as compressed:
decomp = zstandard.ZstdDecompressor()
with open(output_path, 'wb') as destination:
decomp.copy_stream(compressed, destination)
os.remove(zstd_file_path)
print(f"Finished decompressing chunk {i} in {time.time() - start} sec")
def pile_preprocess(download_url, file_path, vocab_file, num_workers, i):
json_file_path = f"{file_path}{i:02}.jsonl"
output_prefix = f"{file_path}pile_bert_train_{i:02}"
if not os.path.exists(f"{output_prefix}_text_sentence.idx"):
if not os.path.exists(json_file_path):
pile_decompress(download_url, file_path, i)
start = time.time()
cmd = f"python ../../tools/preprocess_data.py \
--input {json_file_path} \
--output-prefix {output_prefix} \
--vocab {vocab_file} \
--dataset-impl mmap \
--tokenizer-type BertWordPieceLowerCase \
--split-sentences \
--workers {num_workers} "
        # It's possible to hit a MemoryError during the above cmd, since
        # memory usage is proportional to num_workers. In that case we delete
        # the incomplete output, and the user should retry with a smaller
        # num_workers. In our experience, chunks 6, 7, 9, 17, 18, 20, 21, 24,
        # and 27 have particularly large memory usage.
if os.system(cmd) == 0: # Success
os.remove(json_file_path)
else:
print(f"Error: chunk {i} preprocessing got error, delete \
incomplete output. If MemoryError appeared, please retry \
with num_workers smaller than {num_workers}.")
if os.path.exists(f"{output_prefix}_text_sentence.idx"):
os.remove(f"{output_prefix}_text_sentence.idx")
if os.path.exists(f"{output_prefix}_text_sentence.bin"):
os.remove(f"{output_prefix}_text_sentence.bin")
print(f"Finished preprocessing chunk {i} in {time.time() - start} sec")
def pile_merge(file_path):
start = time.time()
num_chunks = 30
vocab_size = 30524
for i in range(num_chunks):
output_prefix = f"{file_path}pile_bert_train_{i:02}"
assert os.path.exists(f"{output_prefix}_text_sentence.idx")
assert os.path.exists(f"{output_prefix}_text_sentence.bin")
builder = indexed_dataset.make_builder(
f"{file_path}pile_bert_train_text_sentence.bin", impl="mmap",
vocab_size=vocab_size)
for i in range(num_chunks):
chunk_file = f"{file_path}pile_bert_train_{i:02}_text_sentence"
print(f"Merging file {chunk_file}")
builder.merge_file_(chunk_file)
print("Finalizing merged file ...")
builder.finalize(f"{file_path}pile_bert_train_text_sentence.idx")
print(f"Finished merging in {time.time() - start} sec")
# After verifying the merged data with real training, you may want to
# delete the data chunks.
# for i in range(num_chunks):
# output_prefix = f"{file_path}pile_bert_train_{i:02}"
# os.remove(f"{output_prefix}_text_sentence.idx")
# os.remove(f"{output_prefix}_text_sentence.bin")
if __name__ == '__main__':
    # Path to download and store all the output files during the whole
    # process. Estimated max storage usage is around 1.6 TB (or 780GB if you
    # skip the final merge). Memory usage is proportional to num_workers below
    # (it can reach roughly 300GB when num_workers is around 20).
file_path = "/blob/data/the_pile_bert/"
    # The raw Pile data has 30 compressed .zst chunks. To process all chunks
    # on a single machine, run "python prepare_pile_data.py range 0 30". You
    # can also split the work across multiple machines to speed things up,
    # since processing one chunk can take hours. The whole process only uses
    # CPU.
if sys.argv[1] == "merge":
# "python prepare_pile_data.py merge" means merge all 30 processed data
# chunks. Run it only after all 30 chunks are preprocessed. The memory
# usage during merge is about 600GB. If you don't have enough memory,
# one solution is to directly use the 30 data chunks as multiple
# datasets. See '--data-path' in
# github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/arguments.py
pile_merge(file_path)
else:
if sys.argv[1] == "range":
# "python prepare_pile_data.py range 0 30" means process chunk 0-29
selected_chunk = range(int(sys.argv[2]), int(sys.argv[3]))
else:
# "python prepare_pile_data.py 2 5 8" means process chunk 2, 5, 8
selected_chunk = [int(x) for x in sys.argv[1:]]
print("selected_chunk: ", selected_chunk)
        # Number of worker processes. Adjust based on your CPU/memory.
num_workers = 20
        # Where the raw Pile data can be downloaded. The URL may change in the
        # future; contact EleutherAI (https://github.com/EleutherAI/the-pile)
        # if it no longer works.
download_url = "https://the-eye.eu/public/AI/pile/train/"
vocab_file = "bert-large-uncased-vocab.txt"
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt"
if not os.path.exists(vocab_file):
os.system(f"wget {vocab_url}")
os.makedirs(file_path, exist_ok=True)
for i in selected_chunk:
pile_preprocess(download_url, file_path, vocab_file, num_workers, i)
{
"train_batch_size": GBSIZE,
"train_micro_batch_size_per_gpu": MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false,
"dataloader_drop_last": true,
"data_efficiency": {
"enabled": true,
"seed": DATA_EFFICIENCY_SEED,
"data_routing": {
"enabled": LTD_ENABLED,
"random_ltd":{
"enabled": LTD_ENABLED,
"total_layer_num": 24,
"random_ltd_layer_num": 22,
"random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
"model_mask_name": "attention_mask",
"model_type": "encoder",
"hidden_state_order": "seq_batch_dim",
"random_ltd_schedule": {
"min_value": LTD_MIN,
"max_value": LTD_MAX,
"schedule_type":"fixed_linear",
"schedule_config": {
"require_steps": LTD_STEP,
"seq_per_step": 16
}
}
}
},
"data_sampling": {
"enabled": CL_ENABLED,
"num_workers": DATA_SAMPLING_NUM_WORKERS,
"curriculum_learning": {
"enabled": CL_ENABLED,
"data_cluster_path": "CL_CLUSTER_PATH",
"curriculum_metrics": {
"CL_1st_METRIC_NAME": {
"index_to_sample_path": "CL_1st_SAMPLE_PATH",
"index_to_metric_path": "CL_1st_METRIC_PATH",
"difficulty_type": "CL_1st_DIFF_TYPE",
"clustering_type": "CL_1st_CLUSTER_TYPE",
"min_difficulty": CL_1st_MIN,
"max_difficulty": CL_1st_MAX,
"schedule_type": "fixed_root",
"schedule_config": {
"total_curriculum_step": CL_1st_TOTAL_STEP,
"difficulty_step": CL_1st_DIFF_STEP,
"root_degree": CL_1st_ROOT
}
}
}
}
}
}
}
{
"train_batch_size": GBSIZE,
"train_micro_batch_size_per_gpu": MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false,
"dataloader_drop_last": true,
"data_efficiency": {
"enabled": true,
"seed": DATA_EFFICIENCY_SEED,
"data_routing": {
"enabled": LTD_ENABLED,
"random_ltd":{
"enabled": LTD_ENABLED,
"total_layer_num": 24,
"random_ltd_layer_num": 22,
"random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
"model_mask_name": "attention_mask",
"model_type": "encoder",
"hidden_state_order": "seq_batch_dim",
"random_ltd_schedule": {
"min_value": LTD_MIN,
"max_value": LTD_MAX,
"schedule_type":"fixed_linear",
"schedule_config": {
"require_steps": LTD_STEP,
"seq_per_step": 16
}
}
}
},
"data_sampling": {
"enabled": CL_ENABLED,
"num_workers": DATA_SAMPLING_NUM_WORKERS,
"curriculum_learning": {
"enabled": CL_ENABLED,
"data_cluster_path": "CL_CLUSTER_PATH",
"curriculum_metrics": {
"CL_1st_METRIC_NAME": {
"index_to_sample_path": "CL_1st_SAMPLE_PATH",
"index_to_metric_path": "CL_1st_METRIC_PATH",
"difficulty_type": "CL_1st_DIFF_TYPE",
"clustering_type": "CL_1st_CLUSTER_TYPE",
"min_difficulty": CL_1st_MIN,
"max_difficulty": CL_1st_MAX,
"schedule_type": "fixed_root",
"schedule_config": {
"total_curriculum_step": CL_1st_TOTAL_STEP,
"difficulty_step": CL_1st_DIFF_STEP,
"root_degree": CL_1st_ROOT
}
},
"CL_2nd_METRIC_NAME": {
"index_to_sample_path": "CL_2nd_SAMPLE_PATH",
"index_to_metric_path": "CL_2nd_METRIC_PATH",
"difficulty_type": "CL_2nd_DIFF_TYPE",
"clustering_type": "CL_2nd_CLUSTER_TYPE",
"min_difficulty": CL_2nd_MIN,
"max_difficulty": CL_2nd_MAX,
"schedule_type": "fixed_root",
"schedule_config": {
"total_curriculum_step": CL_2nd_TOTAL_STEP,
"difficulty_step": CL_2nd_DIFF_STEP,
"root_degree": CL_2nd_ROOT
}
}
}
}
}
}
}