#!/bin/bash
dir=$(pwd)
###############################################################################
### Main configs
## GPT-3 models use 2K sequence length/context window
seq_len=2048
## The "GPT-3 XXX" below are configs from GPT-3 paper
## https://arxiv.org/abs/2005.14165, choose based on
## your desired model size or build your own configs
## init_std is the standard deviation for weight initialization. Usually a
## larger model needs a lower std. We used a heuristic equation of
## sqrt(1/3/hidden_size) from the MT-NLG 530B work
## (https://arxiv.org/pdf/2201.11990.pdf).
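## For example, for the 1.3B config below (hidden_size=2048), the heuristic
## gives sqrt(1/(3*2048)) ~= 0.0128, which is rounded to init_std=0.013.
## Quick sanity check:
# awk 'BEGIN{printf "%.4f\n", sqrt(1/(3*2048))}'   # prints 0.0128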
## We changed min_lr to a lower number (1.0e-6), which we found is able to
## provide better zero-shot eval results.
## GPT-3 Small 125M
# model_size=0.125
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=256
# lr=6.0e-4
# min_lr=1.0e-6
# init_std=0.02
## GPT-3 Medium 350M
# model_size=0.35
# num_layers=24
# hidden_size=1024
# num_attn_heads=16
# global_batch_size=256
# lr=3.0e-4
# min_lr=1.0e-6
# init_std=0.018
## GPT-3 Large 760M
# model_size=0.76
# num_layers=24
# hidden_size=1536
# num_attn_heads=16
# global_batch_size=256
# lr=2.5e-4
# min_lr=1.0e-6
# init_std=0.015
## GPT-3 XL 1.3B
model_size=1.3
num_layers=24
hidden_size=2048
num_attn_heads=16
global_batch_size=512
lr=2.0e-4
min_lr=1.0e-6
init_std=0.013
## GPT-3 2.7B
# model_size=2.7
# num_layers=32
# hidden_size=2560
# num_attn_heads=32
# global_batch_size=512
# lr=1.6e-4
# min_lr=1.0e-6
# init_std=0.011
## GPT-3 6.7B
# model_size=6.7
# num_layers=32
# hidden_size=4096
# num_attn_heads=32
# global_batch_size=1024
# lr=1.2e-4
# min_lr=1.0e-6
# init_std=0.009
## GPT-3 13B
# model_size=13
# num_layers=40
# hidden_size=5120
# num_attn_heads=40
# global_batch_size=1024
# lr=1.0e-4
# min_lr=1.0e-6
# init_std=0.008
## GPT-3 175B
# model_size=175
# num_layers=96
# hidden_size=12288
# num_attn_heads=96
# global_batch_size=1536
# lr=0.6e-4
# min_lr=1.0e-6
# init_std=0.005
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
train_tokens_in_billion=300
train_tokens=$((${train_tokens_in_billion} * 1000000000))
## train_samples is another termination condition and also affects the number
## of data samples to be indexed. Since we want to reach the train_tokens
## above, and data efficiency techniques may change the number of tokens in
## some samples, we set this config large enough to ensure we have enough
## processed data and don't terminate by train_samples.
train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
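## e.g., with seq_len=2048 this gives 300e9*2/2048 ~= 293M samples, about twice
## the ~146M samples (300e9/2048) that 300B tokens actually require.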
## Another wall-clock time termination condition in minutes. Set it large
## enough to avoid undesired early termination.
exit_duration=30000000
###############################################################################
### lr configs
## lr warmup and decay duration.
## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
## Here we increase the warmup tokens to 3B since, when batch size warmup is
## not used, there are more tokens per step. Thus we need to increase the
## warmup tokens to make sure there are enough warmup steps, which is
## important for training stability.
lr_warmup_tokens_in_million=3000
lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
## Here we changed the LR decay tokens to align with total train tokens, since
## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
## learning rate schedule to match the number of training tokens results in the
## best final model quality
lr_decay_tokens_in_billion=${train_tokens_in_billion}
lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
lr_decay_style="cosine"
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=4
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Note that currently both curriculum learning and random-LTD are NOT
## compatible with pipeline parallelism.
pp_size=8
no_pp="false"
## ZeRO-based data parallelism, stage=0 will disable ZeRO
zero_stage=1
## Total number of GPUs. ds_ssh is from the DeepSpeed library.
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
## Data parallel size.
dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
## Micro batch size per GPU
## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus.
## Reduce it manually if you hit GPU OOM.
# batch_size=$(( ${global_batch_size} / ${dp_size} ))
batch_size=2
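## Worked example (assuming 64 GPUs): dp_size = 64/8/4 = 2, so with
## batch_size=2 each iteration accumulates
## global_batch_size/(dp_size*batch_size) = 512/(2*2) = 128 micro-steps.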
###############################################################################
### curriculum learning (sequence length warmup) configs
# The "divided by 3" means we use 1/3 of baseline's total steps for sequence length warmup.
# This is not always the best config, but usually a reasonable choice to start with.
cl_step=$(( ${lr_warmup_tokens} / 3 / ${global_batch_size} / ${seq_len} ))
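# With the 1.3B config above: (3e9/3) tokens / (512*2048) tokens per step ~= 953 steps.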
# Starting sequence length for sequence length warmup. If the train/validation
# loss is unstable at the beginning of training, increase this value, but keep
# it a multiple of 8 to enable Tensor Core acceleration.
cl_min=64
###############################################################################
### Misc configs
log_interval=10
eval_iters=10
eval_interval=100
# num_save controls how frequently checkpoints are saved. num_save=20 means a
# checkpoint is saved every 5% of training. For longer training you would want
# a larger num_save to save more frequently, and vice versa.
num_save=100
estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
# save_interval=$((${estimated_train_iter} / ${num_save}))
save_interval=100
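## With the 1.3B config, estimated_train_iter ~= 286K iterations, so the
## commented formula above would give save_interval ~= 2861; here we save
## every 100 iterations instead.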
## Activation checkpointing saves GPU memory, but reduces training speed
activation_checkpoint="true"
# activation_checkpoint="false"
## Whether or not to log optimizer states (norms, max abs values) to
## tensorboard. This is not required for training and might save GPU memory
## when turned off.
log_optimizer_state="true"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d_%H.%M.%S")
host="${HOSTNAME}"
seed=1234
num_workers=0
## The public Pile dataset can be downloaded at
## https://mystic.the-eye.eu/public/AI/pile_neox/ or
## https://the-eye.eu/public/AI/pile_neox/. Change data_home to where you
## store the pile_text_document.bin and pile_text_document.idx files.
data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
data_path="${data_home}/pile_text_document"
vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi
prescale_grad="true"
jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
if [[ $zero_stage -gt 0 ]]; then
jobname="${jobname}_z${zero_stage}"
prescale_grad="false"
fi
if [[ $mp_size -gt 1 ]]; then
jobname="${jobname}_mp${mp_size}"
fi
if [ "${no_pp}" = "false" ]; then
jobname="${jobname}_pp${pp_size}"
fi
jobname="${jobname}_seed${seed}_rebase_rope0.25"
jobname="${jobname}_cl_step${cl_step}_cl_min${cl_min}"
username=$(whoami)
output_home="/blob/users/${username}/project/data_efficient_gpt"
log_path="${output_home}/log/"
checkpoint_path="${output_home}/checkpoint/${jobname}"
## Microsoft internal constraint: because tensorboard is logged by the last
## rank, it's better to put the path on NFS instead of Blob.
tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
mkdir -p ${log_path}
mkdir -p ${checkpoint_path}
mkdir -p ${tensorboard_path}
###############################################################################
data_options=" \
--vocab-file ${vocab_path} \
--merge-file ${merge_path} \
--data-path ${data_path} \
--data-impl mmap"
## If CL is used, make sure to set "--split" to the same value you used during
## offline data analysis & indexing.
megatron_options=" \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size ${mp_size} \
--init-method-std ${init_std} \
--lr-decay-tokens ${lr_decay_tokens} \
--lr-warmup-tokens ${lr_warmup_tokens} \
--micro-batch-size ${batch_size} \
--exit-duration-in-mins ${exit_duration} \
--global-batch-size ${global_batch_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--train-tokens ${train_tokens} \
--train-samples ${train_samples} \
--lr ${lr} \
--min-lr ${min_lr} \
--lr-decay-style ${lr_decay_style} \
--split 949,50,1 \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--save-interval ${save_interval} \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers ${num_workers} \
--fp16 \
--seed ${seed} \
--load ${checkpoint_path} \
--save ${checkpoint_path} \
--no-async-tensor-model-parallel-allreduce \
--use-rotary-position-embeddings \
--rotary-percent 0.25 \
--tensorboard-queue-size 1 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensorboard-dir ${tensorboard_path}"
if [ "${activation_checkpoint}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
fi
if [ "${log_optimizer_state}" = "true" ]; then
megatron_options="${megatron_options} \
--log-optimizer-states-to-tensorboard"
fi
config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_cl_step${cl_step}_cl_min${cl_min}.json"
template_json="ds_config_gpt_slw_TEMPLATE.json"
sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
| sed "s/MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/${prescale_grad}/" \
| sed "s/CONFIG_CL_MIN/${cl_min}/" \
| sed "s/CONFIG_CL_MAX/${seq_len}/" \
| sed "s/CONFIG_CL_DURATION/${cl_step}/" \
> ${config_json}
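## For reference, with the configs above the curriculum_learning block of the
## generated JSON is filled in as: "min_difficulty": 64, "max_difficulty": 2048,
## "total_curriculum_step": ${cl_step}, "difficulty_step": 8.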
deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--pipeline-model-parallel-size ${pp_size}"
if [[ "${no_pp}" = "true" ]]; then
deepspeed_options="${deepspeed_options} \
--no-pipeline-parallel"
fi
if [ "${activation_checkpoint}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--deepspeed-activation-checkpointing"
fi
## When saving checkpoints to storage with a cache, there can be a consistency
## issue with the pointer to the latest checkpoint. Here we find the correct
## pointer and broadcast it to all nodes.
iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
iteration_file_2="$checkpoint_path/latest"
iteration=0
for (( node = 0; node <= num_node-1; node++ ))
do
    if ssh -q worker-"$node" "test -f \"$iteration_file\""; then
        local_iteration=$(ssh -q worker-"$node" cat "$iteration_file")
iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} : ${iteration} ))
fi
done
if [[ $iteration -gt 0 ]]; then
iteration_2="global_step${iteration}"
ds_ssh "echo $iteration > $iteration_file"
ds_ssh "echo $iteration_2 > $iteration_file_2"
fi
deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
# baseline
# CONFIG=baseline
# TAG=baseline
# MODEL_SIZE=1558
# LR=1.5e-4
# BSZ=512
# SEQ_LEN=1024
# MP_SIZE=1
# SEED=1234
# SAVE_INTERVAL=5000
# NUM_ITER=600000
# NUM_TOKEN=157286400000
# LR_DECAY_TOKEN=157286400000
# LR_WARMUP_ITER=3000
# CONFIG_TEMPLATE=false
# CURRICULUM_STEP=0
# CURRICULUM_MIN=0
# curriculum learning
CONFIG=curriculum_fixed_linear
MODEL_SIZE=1558
LR=6e-4
BSZ=4096
SEQ_LEN=1024
MP_SIZE=1
SEED=1234
SAVE_INTERVAL=1000
NUM_ITER=75000
NUM_TOKEN=157286400000
LR_DECAY_TOKEN=157286400000
LR_WARMUP_ITER=3000
CONFIG_TEMPLATE=true
CURRICULUM_STEP=45000
CURRICULUM_MIN=64
TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}"
bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN
{
"train_batch_size": 512,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0,
"betas": [0.9, 0.95]
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"wall_clock_breakdown": false,
"zero_allow_untested_optimizer": false
}
{
"train_batch_size": 512,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0,
"betas": [0.9, 0.95]
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"wall_clock_breakdown": false,
"zero_allow_untested_optimizer": false,
"curriculum_learning": {
"enabled": true,
"curriculum_type": "seqlen",
"min_difficulty": CONFIG_CL_MIN,
"max_difficulty": CONFIG_CL_MAX,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": CONFIG_CL_DURATION,
"difficulty_step": 8
}
}
}
This directory includes GPT-3/BERT pretraining example scripts for the DeepSpeed Data Efficiency Library techniques (curriculum learning, random-LTD, and the two composed together).
You need to install a DeepSpeed version that contains the DeepSpeed Data Efficiency Library (>=0.8.0).
An additional tutorial can be found on the [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/).
Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597).
## GPT-3 pretraining and evaluation
Inside the ``gpt`` folder, the ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` scripts are used first for curriculum learning's offline data analysis and indexing.
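As a minimal sketch (assuming the GPT analysis scripts take the worker id as their first argument, like the BERT analysis scripts in this repo), a 2-node analysis job could look like:

```bash
# Map stage: run once per analysis node, with worker ids 0..num_workers-1.
bash ds_analyze_gpt_data_map.sh 0   # on analysis node 0
bash ds_analyze_gpt_data_map.sh 1   # on analysis node 1
# Reduce stage: run once on a single node after all map jobs finish.
bash ds_analyze_gpt_data_reduce.sh
```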
``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality.
``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot.
## BERT pretraining and finetuning
Inside the ``bert`` folder, the ``pile_data_download_preprocess.py`` script can be used first to download and preprocess the public Pile dataset.
The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` scripts are used for curriculum learning's offline data analysis and indexing.
``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality.
``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable, and the Megatron-LM paper used a very large number of epochs for MNLI/QQP, which is not necessary. Thus we added the capability of finetuning other GLUE tasks and switched to the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend using instead of ``bert/finetune``. Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning.
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
import os
import time
import sys
import math
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir,os.path.pardir)))
from datetime import datetime
import numpy as np
import torch
from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer \
import DataAnalyzer
from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset \
import MMapIndexedDataset
from megatron import get_args
from megatron import print_rank_0
from megatron.initialize import initialize_megatron
def get_tasks_args(parser):
"""Provide extra arguments required for data analyzing."""
group = parser.add_argument_group(title='data_analyzing')
group.add_argument('--analyzing-task', type=str, required=True,
default=None,
choices=['map',
'reduce'],
help='What type of analyzing task to perform.')
group.add_argument('--analyzing-data-type', type=str, required=True,
default=None,
choices=['BERT',
'GPT'],
help='What type of data.')
group.add_argument('--analyzing-metric', type=str, nargs='+', default=[],
help='What kinds of metrics to analyze.')
group.add_argument('--analyzing-num-workers', type=int, default=1,
help='Number of workers. Each worker could be a single CPU node.')
group.add_argument('--analyzing-worker-id', type=int, default=0,
help='Worker id of current node.')
group.add_argument('--analyzing-num-threads', type=int, default=1,
help='Number of threads for each worker.')
    group.add_argument('--analyzing-num-threads-reduce', type=int, default=1,
                       help='Number of threads for the reduce job.')
group.add_argument('--analyzing-specific-threads', type=int, nargs='+', default=[],
                       help='Which specific threads to run. Helpful when specific threads failed in a previous run.')
return parser
def train_valid_test_datasets_provider_gpt():
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
from megatron.data.gpt_dataset import build_train_valid_test_datasets
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
def train_valid_test_datasets_provider_bert():
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for BERT ...')
from megatron.data.dataset_utils import build_train_valid_test_datasets
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them
max_seq_length=args.seq_length,
masked_lm_prob=args.mask_prob,
short_seq_prob=args.short_seq_prob,
seed=args.seed,
skip_warmup=(not args.mmap_warmup),
binary_head=args.bert_binary_head)
print_rank_0("> finished creating BERT datasets ...")
return train_ds, valid_ds, test_ds
def metric_seqlen(data):
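    # Sequence length metric: number of non-padding tokens in each sample.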
metric = torch.count_nonzero(data['padding_mask'], dim=1)
return metric
def metric_total_vocab_freq(data):
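    # Dataset-wide token-id histogram; for BERT, padding positions are
    # zero-weighted via the padding_mask.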
args = get_args()
if args.analyzing_data_type == 'BERT':
frequency = torch.bincount(data['text'].view(-1),
minlength=args.padded_vocab_size+1,
weights=data['padding_mask'].view(-1))
elif args.analyzing_data_type == 'GPT':
frequency = torch.bincount(data['text'].view(-1),
minlength=args.padded_vocab_size+1)
return frequency
def metric_vocab_rarity(data):
args = get_args()
if args.analyzing_data_type == 'BERT':
rarity = torch.sum(data['padding_mask'] * \
args.total_vocab_freq[data['text']], dim=1).to(torch.long)
elif args.analyzing_data_type == 'GPT':
rarity = []
        # Process rows one by one to avoid excessive memory consumption
for row in range(data['text'].size()[0]):
rarity.append(int(torch.sum(args.total_vocab_freq[data['text'][row]]).item()))
rarity = torch.tensor(rarity, dtype=torch.long)
print(f"rarity min {min(rarity)}, max {max(rarity)}, len {len(rarity)}, avg {sum(rarity)/len(rarity)}")
return rarity
def metric_seqlen_vocab_rarity(data):
args = get_args()
metric = torch.count_nonzero(data['padding_mask'], dim=1).to(torch.long) * args.seqlen_coeff
metric += torch.sum(data['padding_mask'] * \
args.total_vocab_freq[data['text']], dim=1).to(torch.long)
print(f"metric min {min(metric)}, max {max(metric)}, len {len(metric)}, avg {sum(metric)/len(metric)}")
return metric
def get_metric_function(metric_name):
if metric_name == 'seqlen':
return metric_seqlen
if metric_name == 'total_vocab_freq':
return metric_total_vocab_freq
if metric_name == 'vocab_rarity':
return metric_vocab_rarity
if metric_name == 'seqlen_vocab_rarity':
return metric_seqlen_vocab_rarity
def get_metric_type(metric_name):
if metric_name == 'seqlen':
return 'single_value_per_sample'
if metric_name == 'total_vocab_freq':
return 'accumulate_value_over_samples'
if metric_name == 'vocab_rarity':
return 'single_value_per_sample'
if metric_name == 'seqlen_vocab_rarity':
return 'single_value_per_sample'
def run_map():
args = get_args()
if args.analyzing_data_type == 'BERT':
args.mask_prob = 0 # When analyzing data, we don't want any mask.
train_ds, _, _ = train_valid_test_datasets_provider_bert()
elif args.analyzing_data_type == 'GPT':
train_ds, _, _ = train_valid_test_datasets_provider_gpt()
        assert 'seqlen' not in args.analyzing_metric, 'GPT data has a fixed seqlen, so there is no need to analyze the seqlen metric.'
        assert 'seqlen_vocab_rarity' not in args.analyzing_metric, 'GPT data has a fixed seqlen, so there is no need to analyze the seqlen_vocab_rarity metric.'
if 'vocab_rarity' in args.analyzing_metric or 'seqlen_vocab_rarity' in args.analyzing_metric:
total_vocab_freq_fname = f"{args.save}/total_vocab_freq/total_vocab_freq_metric_value"
assert os.path.isfile(f"{total_vocab_freq_fname}.bin") and os.path.isfile(f"{total_vocab_freq_fname}.idx"), "To analyze vocab rarity, first need to analyze the total vocab freq."
total_vocab_freq = MMapIndexedDataset(total_vocab_freq_fname, skip_warmup=True)
total_vocab_freq = np.copy(total_vocab_freq[0])
total_vocab_freq[total_vocab_freq == 0] = 1 # Avoid log(0) error
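        # Convert counts to -log(relative frequency): rarer tokens get larger values.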
total_vocab_freq = np.log(total_vocab_freq/sum(total_vocab_freq)) * -1
args.total_vocab_freq = torch.tensor(total_vocab_freq, dtype=torch.double)
if 'seqlen_vocab_rarity' in args.analyzing_metric:
            # Use a large coefficient so that seqlen dominates vocab_rarity
max_possible_rarity = args.seq_length * torch.max(args.total_vocab_freq).item()
args.seqlen_coeff = 10 ** (math.ceil(math.log(max_possible_rarity, 10)) + 1)
print(f"Metric seqlen_vocab_rarity: using {args.seqlen_coeff} as coefficient for seqlen.")
metric_functions = [get_metric_function(x) for x in args.analyzing_metric]
metric_types = [get_metric_type(x) for x in args.analyzing_metric]
    # For metric_dtypes we use int64 by default, since it could be hard to
    # estimate the appropriate dtype before the mapping analysis. During the
    # reduce step, where we merge the analysis results, the DataAnalyzer will
    # automatically choose the dtype of the merged result file as the smallest
    # one that meets the range requirement.
metric_dtypes = [np.int64 for x in args.analyzing_metric]
start = time.time()
data_analyzer = DataAnalyzer(train_ds,
num_workers=args.analyzing_num_workers,
worker_id=args.analyzing_worker_id,
num_threads=args.analyzing_num_threads,
specific_threads=args.analyzing_specific_threads,
batch_size=args.global_batch_size, metric_names=args.analyzing_metric,
metric_functions=metric_functions, metric_types=metric_types,
metric_dtypes=metric_dtypes, save_path=args.save)
data_analyzer.run_map()
duration = (time.time() - start) / 3600.0
print(f"map job finished in {duration} hr.")
def run_reduce():
args = get_args()
if args.analyzing_data_type == 'BERT':
args.mask_prob = 0 # When analyzing data, we don't want any mask.
train_ds, _, _ = train_valid_test_datasets_provider_bert()
elif args.analyzing_data_type == 'GPT':
train_ds, _, _ = train_valid_test_datasets_provider_gpt()
metric_functions = [get_metric_function(x) for x in args.analyzing_metric]
metric_types = [get_metric_type(x) for x in args.analyzing_metric]
metric_dtypes = [np.int64 for x in args.analyzing_metric]
start = time.time()
data_analyzer = DataAnalyzer(train_ds,
num_workers=args.analyzing_num_workers,
num_threads=args.analyzing_num_threads,
num_threads_reduce=args.analyzing_num_threads_reduce,
batch_size=args.global_batch_size, metric_names=args.analyzing_metric,
metric_functions=metric_functions, metric_types=metric_types,
metric_dtypes=metric_dtypes, save_path=args.save)
data_analyzer.run_reduce()
duration = (time.time() - start) / 3600.0
print(f"reduce job finished in {duration} hr.")
if __name__ == "__main__":
initialize_megatron(extra_args_provider=get_tasks_args, allow_no_cuda=True)
args = get_args()
if args.analyzing_task == 'map':
run_map()
elif args.analyzing_task == 'reduce':
run_reduce()
else:
raise NotImplementedError('Task {} is not implemented.'.format(
args.analyzing_task))
#!/bin/bash
num_workers=1 # Number of nodes to run the map job
num_threads=40 # Number of threads on each node; set based on the number of CPU cores
# If different data epochs have slightly different data samples (e.g., due
# to randomness), then you need to specify a num_epochs large enough to cover
# the whole pretraining. If different data epochs are identical, set num_epochs
# to 1 to index only 1 epoch; during pretraining the DeepSpeed data efficiency
# library will automatically handle reshuffling when reaching another epoch.
num_epochs=5
# Which node this is (starting at 0 and ending at num_workers-1). This
# script only launches the map job on 1 worker node, since we don't expect
# to run on many nodes and the workers don't need any communication. But you
# can modify this script to add an MPI/torch distributed launcher.
worker_id=$1
save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/"
metric='total_vocab_freq'
# metric='vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen'
seq_len=512
batch_size=10000
jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}"
## The public Pile dataset: see prepare_pile_data.py in the same directory
## for how to download and preprocess the data.
## Change data_home to your own training data path.
# data_home="/vc_data_blob/users/conglli/the_pile_bert"
data_home="/blob/data/the_pile_bert"
data_path="${data_home}/pile_bert_train_text_sentence"
vocab_path="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
# Make sure the "--split" is the same as what you will use for pre-training.
options=" \
--analyzing-task map \
--analyzing-data-type BERT \
--analyzing-metric ${metric} \
--analyzing-num-workers ${num_workers} \
--analyzing-worker-id ${worker_id} \
--analyzing-num-threads ${num_threads} \
--vocab-file ${vocab_path} \
--data-path ${data_path} \
--data-impl mmap \
--tokenizer-type BertWordPieceLowerCase \
--micro-batch-size ${batch_size} \
--global-batch-size ${batch_size} \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--num-layers 1 \
--hidden-size 1 \
--num-attention-heads 1 \
--split 949,50,1 \
--distributed-backend gloo \
--train-data-exact-num-epochs ${num_epochs} \
--return-data-index \
--save-interval 1 \
--save ${save_path}"
python ../analyze_data.py ${options} &> ${jobname}.log
#!/bin/bash
# Set these 2 to the same values you used during the map job. We need these 2
# configs to know how many map job result files there are.
num_workers=1
num_threads=40
# The reduce job only has 1 worker but can be accelerated by multithreading.
num_threads_reduce=40
# If different data epochs have slightly different data samples (e.g., due
# to randomness), then you need to specify a num_epochs large enough to cover
# the whole pretraining. If different data epochs are identical, set num_epochs
# to 1 to index only 1 epoch; during pretraining the DeepSpeed data efficiency
# library will automatically handle reshuffling when reaching another epoch.
num_epochs=5
save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/"
metric='total_vocab_freq'
# metric='vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq
# metric='seqlen'
seq_len=512
batch_size=10000
jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce"
## The public Pile dataset: see prepare_pile_data.py in the same directory
## for how to download and preprocess the data.
## Change data_home to your own training data path.
# data_home="/vc_data_blob/users/conglli/the_pile_bert"
data_home="/blob/data/the_pile_bert"
data_path="${data_home}/pile_bert_train_text_sentence"
vocab_path="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
# Make sure the "--split" is the same as what you will use for pre-training.
options=" \
--analyzing-task reduce \
--analyzing-data-type BERT \
--analyzing-metric ${metric} \
--analyzing-num-workers ${num_workers} \
--analyzing-num-threads ${num_threads} \
--analyzing-num-threads-reduce ${num_threads_reduce} \
--vocab-file ${vocab_path} \
--data-path ${data_path} \
--data-impl mmap \
--tokenizer-type BertWordPieceLowerCase \
--micro-batch-size ${batch_size} \
--global-batch-size ${batch_size} \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--num-layers 1 \
--hidden-size 1 \
--num-attention-heads 1 \
--split 949,50,1 \
--distributed-backend gloo \
--train-data-exact-num-epochs ${num_epochs} \
--return-data-index \
--save-interval 1 \
--save ${save_path}"
python ../analyze_data.py ${options} &> ${jobname}.log
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false
}
seed=1234
pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
### The main configs are from Megatron-LM paper
### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
### or build your own configs.
seq_len=512
## From Table 6 in https://arxiv.org/abs/1909.08053.
task="MNLI"
global_batch_size=128
lr=1e-5
epochs=10
train_data="/blob/data/GlueData/MNLI/train.tsv"
valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
/blob/data/GlueData/MNLI/dev_mismatched.tsv"
## Adjust based on number of GPUs.
batch_size=16
## BERT 110M (same config as original BERT-Base model)
## This config is not included in Megatron-LM paper
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
## BERT 336M (same config as original BERT-Large model)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
save_interval=500000
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
mkdir -p ${checkpoint_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.065 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--save-interval ${save_interval} \
--save ${checkpoint_path} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev set accuracy numbers by
# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
seed=1234
pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
### The main configs are from Megatron-LM paper
### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
### or build your own configs.
seq_len=512
## From Table 6 in https://arxiv.org/abs/1909.08053.
task="QQP"
train_data="/blob/data/GlueData/QQP/train.tsv"
valid_data="/blob/data/GlueData/QQP/dev.tsv"
## Adjust based on number of GPUs.
batch_size=16
## BERT 110M (same config as original BERT-Base model)
## This config is not included in Megatron-LM paper
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=128
# lr=5e-5
# epochs=12
## BERT 336M (same config as original BERT-Large model)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
global_batch_size=128
lr=5e-5
epochs=12
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
# global_batch_size=128
# lr=3e-5
# epochs=12
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
# global_batch_size=256
# lr=4e-5
# epochs=12
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
save_interval=500000
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
mkdir -p ${checkpoint_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.065 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--save-interval ${save_interval} \
--save ${checkpoint_path} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev set accuracy numbers by
# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
seed=1234
## RACE has two sub-tasks that need to be finetuned separately
difficulty="middle"
# difficulty="high"
pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
### The main configs are from Megatron-LM paper
### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
### or build your own configs.
seq_len=512
## From Table 6 in https://arxiv.org/abs/1909.08053.
task="RACE"
## The RACE dataset can be downloaded via:
## wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz
train_data="/blob/data/RACE/train/${difficulty}"
## The Megatron paper https://arxiv.org/abs/1909.08053 says: "For the test set
## results of RACE, we first use the development set to find the checkpoint
## that gives us the median score on the 5 random seeds and we report the
## results from that checkpoint on the test set", which is a rather confusing
## description. For simplicity, we instead directly take the median dev and
## test set scores over 5 random seeds from a single pretrained_checkpoint.
valid_data="/blob/data/RACE/dev/${difficulty} \
/blob/data/RACE/test/${difficulty}"
## Adjust based on number of GPUs.
batch_size=4
## BERT 110M (same config as original BERT-Base model)
## This config is not included in Megatron-LM paper
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=32
# lr=2e-5
# epochs=3
## BERT 336M (same config as original BERT-Large model)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
global_batch_size=32
lr=2e-5
epochs=3
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
# global_batch_size=16
# lr=1e-5
# epochs=3
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
# global_batch_size=32
# lr=2e-5
# epochs=3
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
save_interval=100000
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-${difficulty}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
mkdir -p ${checkpoint_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.06 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--save-interval ${save_interval} \
--save ${checkpoint_path} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--clip-grad 1.0 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev/test set accuracy numbers
# by "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
import os
import statistics
def gather_numbers(fname, match_keywords, index_keywords, index_offsets):
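    # Scan the log file line by line; whenever match_keywords[i] appears in a
    # line, locate index_keywords[i] among the space-separated tokens and
    # record the token index_offsets[i] positions after it as a float.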
results = {}
for k in index_keywords:
results[k] = []
file1 = open(fname, 'r')
while True:
line = file1.readline()
if not line:
break
splits = line.split(' ')
for i in range(len(match_keywords)):
if match_keywords[i] in line:
ref_idx = splits.index(index_keywords[i])
results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]]))
file1.close()
return results
def gather_MNLI_results(result_path):
overall = []
matched = []
mismatched = []
for file in os.listdir(result_path):
if file.startswith('MNLI'):
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
results = gather_numbers(fname,
['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'],
['overall:', 'dev-matched:', 'dev-mismatched:'],
[9, 9, 9])
overall_candidate = results['overall:']
matched_candidate = results['dev-matched:']
mismatched_candidate = results['dev-mismatched:']
if len(overall_candidate) > 0:
assert len(overall_candidate) == len(matched_candidate) and len(overall_candidate) == len(mismatched_candidate)
best_index = overall_candidate.index(max(overall_candidate))
overall.append(overall_candidate[best_index])
matched.append(matched_candidate[best_index])
mismatched.append(mismatched_candidate[best_index])
if len(overall) > 0:
if len(overall) % 2 == 1:
median_idx = overall.index(statistics.median(overall))
else:
median_idx = overall.index(statistics.median_high(overall))
print(f'MNLI how Megatron paper reported: overall results median {statistics.median(overall)}, corresponding matched/mismatched: {matched[median_idx]}/{mismatched[median_idx]}')
print(f'MNLI other results:')
print(f'MNLI overall results {overall}, median {statistics.median(overall)} (corresponding matched/mismatched {matched[median_idx]}/{mismatched[median_idx]}), mean {statistics.mean(overall)}, std {statistics.stdev(overall)}')
print(f'MNLI matched results {matched}, median {statistics.median(matched)}, mean {statistics.mean(matched)}, std {statistics.stdev(matched)}')
print(f'MNLI mismatched results {mismatched}, median {statistics.median(mismatched)}, mean {statistics.mean(mismatched)}, std {statistics.stdev(mismatched)}')
else:
print("Didn't find any MNLI result")
def gather_QQP_results(result_path):
overall = []
for file in os.listdir(result_path):
if file.startswith('QQP'):
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
results = gather_numbers(fname, ['overall:'], ['overall:'], [9])
overall_candidate = results['overall:']
if len(overall_candidate) > 0:
best_index = overall_candidate.index(max(overall_candidate))
overall.append(overall_candidate[best_index])
if len(overall) > 0:
print(f'QQP how Megatron paper reported: overall results median {statistics.median(overall)}')
print(f'QQP other results:')
print(f'QQP overall results {overall}, median {statistics.median(overall)}, mean {statistics.mean(overall)}, std {statistics.stdev(overall)}')
else:
print("Didn't find any QQP result")
def gather_RACE_results(result_path, task):
dev = []
test = []
for file in os.listdir(result_path):
if file.startswith(f'RACE-{task}'):
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
results = gather_numbers(fname,
[f'metrics for dev-{task}:', f'metrics for test-{task}:'],
[f'dev-{task}:', f'test-{task}:'],
[9, 9])
dev_candidate = results[f'dev-{task}:']
test_candidate = results[f'test-{task}:']
if len(dev_candidate) > 0:
assert len(dev_candidate) == len(test_candidate)
dev.append(max(dev_candidate))
test.append(max(test_candidate))
if len(dev) > 0:
if len(dev) % 2 == 1:
median_idx = dev.index(statistics.median(dev))
else:
median_idx = dev.index(statistics.median_high(dev))
print(f'RACE-{task} how Megatron paper reported: test result from the median of dev results {test[median_idx]}')
print(f'RACE-{task} other results:')
print(f'RACE-{task} dev results {dev}, median {statistics.median(dev)}, mean {statistics.mean(dev)}, std {statistics.stdev(dev)}')
print(f'RACE-{task} test results {test}, median {statistics.median(test)}, mean {statistics.mean(test)}, std {statistics.stdev(test)}')
else:
print(f"Didn't find any RACE-{task} result")
def gather_finetune_results(result_path):
print(f'Gather finetune results for {result_path}')
gather_MNLI_results(result_path)
gather_QQP_results(result_path)
gather_RACE_results(result_path, 'middle')
gather_RACE_results(result_path, 'high')
if __name__ == '__main__':
result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/'
gather_finetune_results(result_path)
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false
}
hostname_and_rank=$1
master_port=$2
seed=$3
task=$4
lr=$5
pretrained_checkpoint=$6
# hostname_and_rank="worker-0:0,1,2,3"
# master_port=12345
# seed=1234
# task="MNLI"
# lr=2e-5
# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
###############################################################################
### Main configs
seq_len=512
global_batch_size=32
epochs=3
train_data="/blob/data/GlueData/${task}/train.tsv"
valid_data="/blob/data/GlueData/${task}/dev.tsv"
if [[ "${task}" = "MNLI" ]]; then
valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
/blob/data/GlueData/MNLI/dev_mismatched.tsv"
fi
## Adjust based on number of GPUs.
batch_size=8
## BERT 110M (BERT-Base)
# model_size=0.11
# num_layers=12
# hidden_size=768
# num_attn_heads=12
## BERT 336M (BERT-Large)
model_size=0.336
num_layers=24
hidden_size=1024
num_attn_heads=16
## BERT 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=32
## BERT 3.9B
# model_size=3.9
# num_layers=48
# hidden_size=2560
# num_attn_heads=40
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
mp_size=1
## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Currently pipeline parallelism is not supported for the BERT model: DeepSpeed's
## pipeline parallelism is only integrated with the GPT case, and currently
## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
pp_size=1
no_pp="true"
## ZeRO stage
zero_stage=0
###############################################################################
### Misc configs
log_interval=10
eval_iters=50
eval_interval=100
## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"
###############################################################################
vocab_file="bert-large-uncased-vocab.txt"
if [ ! -f "$vocab_file" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
fi
jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
# output_path="${pretrained_checkpoint}-finetune-glue-4v100/${jobname}"
output_path=$(basename "$pretrained_checkpoint")
output_path="glue-results/${output_path}-finetune-glue-4v100/${jobname}"
mkdir -p ${output_path}
template_json="ds_config_bert_TEMPLATE.json"
config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
if [[ $zero_stage -gt 0 ]]; then
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/false/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
else
sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${batch_size}/" \
| sed "s/LOG_INTERVAL/${log_interval}/" \
| sed "s/ZERO_STAGE/${zero_stage}/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
> ${config_json}
fi
options=" \
--finetune \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${zero_stage} \
--task ${task} \
--seed ${seed} \
--train-data ${train_data} \
--valid-data ${valid_data} \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${vocab_file} \
--epochs ${epochs} \
--pretrained-checkpoint ${pretrained_checkpoint} \
--tensor-model-parallel-size ${mp_size} \
--pipeline-model-parallel-size ${pp_size} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attn_heads} \
--global-batch-size ${global_batch_size} \
--micro-batch-size ${batch_size} \
--lr ${lr} \
--lr-decay-style linear \
--lr-warmup-fraction 0.1 \
--seq-length ${seq_len} \
--max-position-embeddings ${seq_len} \
--log-interval ${log_interval} \
--eval-interval ${eval_interval} \
--eval-iters ${eval_iters} \
--weight-decay 1.0e-1 \
--fp16"
if [ "${activation_checkpoint}" = "true" ]; then
options="${options} \
--checkpoint-activations \
--deepspeed-activation-checkpointing"
fi
if [[ "${no_pp}" = "true" ]]; then
options="${options} \
--no-pipeline-parallel"
fi
# After the fine-tuning finishes, you can find the dev set accuracy numbers by
# "grep -e "overall:" -e "metrics for" ${output_path}/output.log"
deepspeed --include=${hostname_and_rank} --master_port=${master_port} ../../../../tasks/main.py ${options} &> ${output_path}/output.log
hostname_and_rank=$1
master_port=$2
pretrained_checkpoint=$3
# hostname_and_rank="worker-0:0,1,2,3"
# master_port=12345
# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
tasks=(
RTE
MRPC
STS-B
CoLA
SST-2
QNLI
QQP
MNLI
)
seeds=(
1234
1235
1236
1237
1238
)
lrs=(
2e-5
3e-5
4e-5
5e-5
)
for ((i=0;i<${#tasks[@]};++i)); do
task=${tasks[i]}
for ((j=0;j<${#seeds[@]};++j)); do
seed=${seeds[j]}
for ((k=0;k<${#lrs[@]};++k)); do
lr=${lrs[k]}
bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint}
done
done
done
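# Note: this sweep launches 8 tasks x 5 seeds x 4 learning rates = 160
# sequential fine-tuning runs, so expect it to take a long time on a single
# node.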
import os
import statistics
def gather_numbers(fname, match_keywords, index_keywords, index_offsets):
results = {}
for k in index_keywords:
results[k] = []
    with open(fname, 'r') as f:
        for line in f:
            splits = line.split(' ')
            for i in range(len(match_keywords)):
                if match_keywords[i] in line:
                    ref_idx = splits.index(index_keywords[i])
                    results[index_keywords[i]].append(
                        float(splits[ref_idx + index_offsets[i]]))
return results
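# Example of how the offsets are used, on a hypothetical log line
# "metrics for dev: spearmanr = 0.85": splitting on spaces,
# splits.index('spearmanr') finds the keyword's position, and an index_offset
# of 2 skips the '=' token to pick up the value '0.85'.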
def gather_GLUE_results(result_path, key, lr):
result = []
mnli_matched_result = []
mnli_mismatched_result = []
for file in os.listdir(result_path):
if file.startswith(key) and lr in file:
fname = f'{result_path}/{file}/output.log'
if os.path.exists(fname):
if key == "STS-B":
results = gather_numbers(fname, ['metrics for'], ['spearmanr'], [2])
overall_candidate = results['spearmanr']
overall_candidate = [x * 100.0 for x in overall_candidate]
elif key == "CoLA":
results = gather_numbers(fname, ['metrics for'], ['mcc'], [2])
overall_candidate = results['mcc']
overall_candidate = [x * 100.0 for x in overall_candidate]
elif key == "MNLI":
results = gather_numbers(fname,
['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'],
['overall:', 'dev-matched:', 'dev-mismatched:'],
[9, 9, 9])
overall_candidate = results['overall:']
matched_candidate = results['dev-matched:']
mismatched_candidate = results['dev-mismatched:']
else:
results = gather_numbers(fname, ['overall:'], ['overall:'], [9])
overall_candidate = results['overall:']
if len(overall_candidate) > 0:
                    if len(overall_candidate) != 3:
                        print(f"{result_path} task {key} lr {lr} only has {len(overall_candidate)} epochs (expected 3)")
best_index = overall_candidate.index(max(overall_candidate))
result.append(overall_candidate[best_index])
if key == "MNLI":
mnli_matched_result.append(matched_candidate[best_index])
mnli_mismatched_result.append(mismatched_candidate[best_index])
if len(result) > 0:
        if len(result) != 5:
            print(f"{result_path} task {key} lr {lr} only has {len(result)} seeds (expected 5)")
if key == "MNLI":
best_index = result.index(statistics.median_high(result))
return round(mnli_matched_result[best_index],2), round(statistics.stdev(mnli_matched_result),2), round(mnli_mismatched_result[best_index],2), round(statistics.stdev(mnli_mismatched_result),2)
else:
return round(statistics.median_high(result),2), round(statistics.stdev(result),2)
else:
if key == "MNLI":
return None, None, None, None
else:
return None, None
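# Selection logic above, on a hypothetical set of 5 per-seed results
# [88.1, 88.4, 88.9, 89.0, 89.2]: statistics.median_high picks 88.9 (the
# middle value for an odd count, the higher of the two middle values for an
# even count), and the reported spread is the sample stdev across seeds.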
def gather_finetune_results(result_path, extra_col=[], lr="2e-5"):
output = ""
for field in extra_col:
output += f"{field} &"
task_output = ""
median_list, std_list = [], []
m_median, m_std, mm_median, mm_std = gather_GLUE_results(result_path, "MNLI", lr)
if m_median is not None:
median_list += [m_median, mm_median]
std_list += [m_std, mm_std]
task_output += f"{m_median}±{m_std} & {mm_median}±{mm_std} &"
tasks = ["QQP", "QNLI", "SST-2", "CoLA", "STS-B", "MRPC", "RTE"]
for task in tasks:
t_median, t_std = gather_GLUE_results(result_path, task, lr)
if t_median is not None:
median_list += [t_median]
std_list += [t_std]
if task == "RTE":
task_output += f"{t_median}±{t_std} "
else:
task_output += f"{t_median}±{t_std} &"
overall_median = round(sum(median_list) / len(median_list), 2)
overall_std = round(sum(std_list) / len(std_list), 2)
output += f"{overall_median}±{overall_std} &"
output += task_output
output += " \\\\"
print(output)
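# The printed string is one LaTeX table row, e.g. with hypothetical numbers:
# "85.1±0.3 & 84.9±0.2 & 85.3±0.4 & ... \\". After any extra_col fields, the
# overall average comes first, then MNLI-m/MNLI-mm, then the remaining tasks
# in the order listed above.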
if __name__ == '__main__':
print("\\begin{table}")
print("\centering")
print("\\tiny")
text = "\\begin{tabular}{@{}l|"
for _ in range(11):
text += "c"
text += "@{}}"
print(text)
print("\\toprule")
print("Case & Train tokens & Average & MNLI-m & MNLI-mm & QQP & QNLI & SST-2 & CoLA & STS-B & MRPC & RTE \\\\")
print("\midrule")
result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/'
gather_finetune_results(result_path)
print("\\bottomrule")
print("\end{tabular}")
print("\end{table}")
print("")
print("")
import zstandard
import sys
import time
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir,os.path.pardir,os.path.pardir)))
from megatron.data import indexed_dataset
def pile_download(download_url, file_path, i):
start = time.time()
zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
download_path = f"{download_url}{i:02}.jsonl.zst"
if not os.path.exists(zstd_file_path):
os.system(f"wget -P {file_path} {download_path}")
print(f"Finished downloading chunk {i} in {time.time() - start} sec")
def pile_decompress(download_url, file_path, i):
zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
output_path = f"{file_path}{i:02}.jsonl"
if not os.path.exists(output_path):
if not os.path.exists(zstd_file_path):
pile_download(download_url, file_path, i)
start = time.time()
with open(zstd_file_path, 'rb') as compressed:
decomp = zstandard.ZstdDecompressor()
with open(output_path, 'wb') as destination:
decomp.copy_stream(compressed, destination)
os.remove(zstd_file_path)
print(f"Finished decompressing chunk {i} in {time.time() - start} sec")
def pile_preprocess(download_url, file_path, vocab_file, num_workers, i):
json_file_path = f"{file_path}{i:02}.jsonl"
output_prefix = f"{file_path}pile_bert_train_{i:02}"
if not os.path.exists(f"{output_prefix}_text_sentence.idx"):
if not os.path.exists(json_file_path):
pile_decompress(download_url, file_path, i)
start = time.time()
cmd = f"python ../../tools/preprocess_data.py \
--input {json_file_path} \
--output-prefix {output_prefix} \
--vocab {vocab_file} \
--dataset-impl mmap \
--tokenizer-type BertWordPieceLowerCase \
--split-sentences \
--workers {num_workers} "
        # It's possible to hit a MemoryError during the above cmd, since
        # memory usage is proportional to num_workers. In that case we delete
        # the incomplete output, and the user should retry with a smaller
        # num_workers. In our experience, chunks 6, 7, 9, 17, 18, 20, 21, 24,
        # and 27 have particularly large memory usage.
if os.system(cmd) == 0: # Success
os.remove(json_file_path)
else:
print(f"Error: chunk {i} preprocessing got error, delete \
incomplete output. If MemoryError appeared, please retry \
with num_workers smaller than {num_workers}.")
if os.path.exists(f"{output_prefix}_text_sentence.idx"):
os.remove(f"{output_prefix}_text_sentence.idx")
if os.path.exists(f"{output_prefix}_text_sentence.bin"):
os.remove(f"{output_prefix}_text_sentence.bin")
print(f"Finished preprocessing chunk {i} in {time.time() - start} sec")
def pile_merge(file_path):
start = time.time()
num_chunks = 30
vocab_size = 30524
for i in range(num_chunks):
output_prefix = f"{file_path}pile_bert_train_{i:02}"
assert os.path.exists(f"{output_prefix}_text_sentence.idx")
assert os.path.exists(f"{output_prefix}_text_sentence.bin")
builder = indexed_dataset.make_builder(
f"{file_path}pile_bert_train_text_sentence.bin", impl="mmap",
vocab_size=vocab_size)
for i in range(num_chunks):
chunk_file = f"{file_path}pile_bert_train_{i:02}_text_sentence"
print(f"Merging file {chunk_file}")
builder.merge_file_(chunk_file)
print("Finalizing merged file ...")
builder.finalize(f"{file_path}pile_bert_train_text_sentence.idx")
print(f"Finished merging in {time.time() - start} sec")
# After verifying the merged data with real training, you may want to
# delete the data chunks.
# for i in range(num_chunks):
# output_prefix = f"{file_path}pile_bert_train_{i:02}"
# os.remove(f"{output_prefix}_text_sentence.idx")
# os.remove(f"{output_prefix}_text_sentence.bin")
if __name__ == '__main__':
    # Path to download and store all the output files during the whole
    # process. Estimated max storage usage is around 1.6 TB (or 780GB if you
    # skip the final merge). Memory usage is proportional to num_workers below
    # (it can reach roughly 300GB when num_workers is around 20).
file_path = "/blob/data/the_pile_bert/"
    # The raw Pile data has 30 compressed .zst chunks. To process all chunks
    # on a single machine, run "python prepare_pile_data.py range 0 30". You
    # can also split the work across multiple machines to speed things up,
    # since processing one chunk can take hours. The whole process only uses
    # CPU.
if sys.argv[1] == "merge":
# "python prepare_pile_data.py merge" means merge all 30 processed data
# chunks. Run it only after all 30 chunks are preprocessed. The memory
# usage during merge is about 600GB. If you don't have enough memory,
# one solution is to directly use the 30 data chunks as multiple
# datasets. See '--data-path' in
# github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/arguments.py
pile_merge(file_path)
else:
if sys.argv[1] == "range":
# "python prepare_pile_data.py range 0 30" means process chunk 0-29
selected_chunk = range(int(sys.argv[2]), int(sys.argv[3]))
else:
# "python prepare_pile_data.py 2 5 8" means process chunk 2, 5, 8
selected_chunk = [int(x) for x in sys.argv[1:]]
print("selected_chunk: ", selected_chunk)
        # Number of worker processes. Adjust based on your CPU/memory.
num_workers = 20
        # Where the raw Pile data can be downloaded. The URL may change in the
        # future; contact EleutherAI (https://github.com/EleutherAI/the-pile)
        # if it no longer works.
download_url = "https://the-eye.eu/public/AI/pile/train/"
vocab_file = "bert-large-uncased-vocab.txt"
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt"
if not os.path.exists(vocab_file):
os.system(f"wget {vocab_url}")
os.makedirs(file_path, exist_ok=True)
for i in selected_chunk:
pile_preprocess(download_url, file_path, vocab_file, num_workers, i)
{
"train_batch_size": GBSIZE,
"train_micro_batch_size_per_gpu": MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false,
"dataloader_drop_last": true,
"data_efficiency": {
"enabled": true,
"seed": DATA_EFFICIENCY_SEED,
"data_routing": {
"enabled": LTD_ENABLED,
"random_ltd":{
"enabled": LTD_ENABLED,
"total_layer_num": 24,
"random_ltd_layer_num": 22,
"random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
"model_mask_name": "attention_mask",
"model_type": "encoder",
"hidden_state_order": "seq_batch_dim",
"random_ltd_schedule": {
"min_value": LTD_MIN,
"max_value": LTD_MAX,
"schedule_type":"fixed_linear",
"schedule_config": {
"require_steps": LTD_STEP,
"seq_per_step": 16
}
}
}
},
"data_sampling": {
"enabled": CL_ENABLED,
"num_workers": DATA_SAMPLING_NUM_WORKERS,
"curriculum_learning": {
"enabled": CL_ENABLED,
"data_cluster_path": "CL_CLUSTER_PATH",
"curriculum_metrics": {
"CL_1st_METRIC_NAME": {
"index_to_sample_path": "CL_1st_SAMPLE_PATH",
"index_to_metric_path": "CL_1st_METRIC_PATH",
"difficulty_type": "CL_1st_DIFF_TYPE",
"clustering_type": "CL_1st_CLUSTER_TYPE",
"min_difficulty": CL_1st_MIN,
"max_difficulty": CL_1st_MAX,
"schedule_type": "fixed_root",
"schedule_config": {
"total_curriculum_step": CL_1st_TOTAL_STEP,
"difficulty_step": CL_1st_DIFF_STEP,
"root_degree": CL_1st_ROOT
}
}
}
}
}
}
}
{
"train_batch_size": GBSIZE,
"train_micro_batch_size_per_gpu": MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"wall_clock_breakdown" : false,
"dataloader_drop_last": true,
"data_efficiency": {
"enabled": true,
"seed": DATA_EFFICIENCY_SEED,
"data_routing": {
"enabled": LTD_ENABLED,
"random_ltd":{
"enabled": LTD_ENABLED,
"total_layer_num": 24,
"random_ltd_layer_num": 22,
"random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
"model_mask_name": "attention_mask",
"model_type": "encoder",
"hidden_state_order": "seq_batch_dim",
"random_ltd_schedule": {
"min_value": LTD_MIN,
"max_value": LTD_MAX,
"schedule_type":"fixed_linear",
"schedule_config": {
"require_steps": LTD_STEP,
"seq_per_step": 16
}
}
}
},
"data_sampling": {
"enabled": CL_ENABLED,
"num_workers": DATA_SAMPLING_NUM_WORKERS,
"curriculum_learning": {
"enabled": CL_ENABLED,
"data_cluster_path": "CL_CLUSTER_PATH",
"curriculum_metrics": {
"CL_1st_METRIC_NAME": {
"index_to_sample_path": "CL_1st_SAMPLE_PATH",
"index_to_metric_path": "CL_1st_METRIC_PATH",
"difficulty_type": "CL_1st_DIFF_TYPE",
"clustering_type": "CL_1st_CLUSTER_TYPE",
"min_difficulty": CL_1st_MIN,
"max_difficulty": CL_1st_MAX,
"schedule_type": "fixed_root",
"schedule_config": {
"total_curriculum_step": CL_1st_TOTAL_STEP,
"difficulty_step": CL_1st_DIFF_STEP,
"root_degree": CL_1st_ROOT
}
},
"CL_2nd_METRIC_NAME": {
"index_to_sample_path": "CL_2nd_SAMPLE_PATH",
"index_to_metric_path": "CL_2nd_METRIC_PATH",
"difficulty_type": "CL_2nd_DIFF_TYPE",
"clustering_type": "CL_2nd_CLUSTER_TYPE",
"min_difficulty": CL_2nd_MIN,
"max_difficulty": CL_2nd_MAX,
"schedule_type": "fixed_root",
"schedule_config": {
"total_curriculum_step": CL_2nd_TOTAL_STEP,
"difficulty_step": CL_2nd_DIFF_STEP,
"root_degree": CL_2nd_ROOT
}
}
}
}
}
}
}