Commit e5ca7e62 authored by hepj987

Initialize repository

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import os
import sys
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For
single sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second
sequence. Must only be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test
examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")),
"train",
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
"dev",
)
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[3]
text_b = line[4]
label = line[0]
examples.append(
InputExample(guid=guid,
text_a=text_a,
text_b=text_b,
label=label))
return examples
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")),
"train",
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched",
)
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[8]
text_b = line[9]
label = line[-1]
examples.append(
InputExample(guid=guid,
text_a=text_a,
text_b=text_b,
label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")),
"train",
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
"dev",
)
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = line[3]
label = line[1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None,
label=label))
return examples
class Sst2Processor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")),
"train",
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
"dev",
)
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[0]
label = line[1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None,
label=label))
return examples
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
tokens += tokens_b + ["[SEP]"]
segment_ids += [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = label_map[example.label]
features.append(
InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id))
return features, label_map
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
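# Worked example (illustrative, not part of the original file): truncating a
# pair to at most 8 tokens. The longer sequence loses tokens from its tail one
# at a time, so
#   tokens_a = ["the", "cat", "sat", "on", "the", "mat"]   # 6 tokens
#   tokens_b = ["a", "cat", "sits"]                        # 3 tokens
#   _truncate_seq_pair(tokens_a, tokens_b, 8)
# leaves tokens_a == ["the", "cat", "sat", "on", "the"] and tokens_b unchanged.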
PROCESSORS = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mrpc": MrpcProcessor,
"sst-2": Sst2Processor,
}
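# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows how
# example sentences flow through convert_examples_to_features, using a
# hypothetical whitespace tokenizer as a stand-in for tokenization.BertTokenizer,
# which the training scripts use in practice.
class _WhitespaceTokenizer(object):
    """Toy stand-in tokenizer: splits on whitespace, assigns ids on first use."""

    def __init__(self):
        self.vocab = {}

    def tokenize(self, text):
        return text.lower().split()

    def convert_tokens_to_ids(self, tokens):
        # Assign an id the first time a token is seen; a real vocabulary
        # lookup happens inside BertTokenizer.
        return [self.vocab.setdefault(token, len(self.vocab)) for token in tokens]


if __name__ == "__main__":
    demo_examples = [
        InputExample(guid="demo-1",
                     text_a="the cat sat on the mat",
                     text_b="a cat is sitting",
                     label="1"),
    ]
    demo_features, demo_label_map = convert_examples_to_features(
        demo_examples,
        label_list=["0", "1"],
        max_seq_length=16,
        tokenizer=_WhitespaceTokenizer(),
    )
    # segment_ids are 0 for "[CLS] text_a [SEP]" and 1 for "text_b [SEP]",
    # followed by zero padding up to max_seq_length.
    print("label_map:  ", demo_label_map)
    print("input_ids:  ", demo_features[0].input_ids)
    print("input_mask: ", demo_features[0].input_mask)
    print("segment_ids:", demo_features[0].segment_ids)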
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
six
ipdb
#Data processing
h5py
html2text
nltk
progressbar
#Others
onnxruntime
git+https://github.com/NVIDIA/dllogger
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=3
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below has been modified
# CHECKPOINT=/public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt
CHECKPOINT=/public/home/hepj/model_source/model_pytorch.ckpt.pt
APP="python3 run_squad_v1.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint $CHECKPOINT \
--vocab_file /public/home/hepj//model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/tourch/SQuAD \
--config_file /public/home/hepj//model_source/pytorch_bert/bert_config.json \
--json-summary ./log/results-squad-fp16.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank -1 \
--fp16 \
--amp \
--eval_script ./evaluate-v1.1.py
"
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -eux
# The following variables need to be set
# Base container to be used - the container built in step 1 of the quick start guide
readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
# Location of dataset for phase 1
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
# Location of dataset for phase 2
readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/checkpoints"
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
PHASE1="\
--train_batch_size=${BATCHSIZE:-16} \
--learning_rate=${LR:-6e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.2843} \
--input_dir=/workspace/data \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=7038 \
--num_steps_per_checkpoint=2500 \
"
PHASE2="\
--train_batch_size=${BATCHSIZE:-4096} \
--learning_rate=${LR:-4e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.128} \
--input_dir=/workspace/data_phase2 \
--phase2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=1563 \
--num_steps_per_checkpoint=1000 \
--resume_from_checkpoint --phase1_end_step=7038 \
"
PHASES=( "$PHASE1" "$PHASE2" )
PHASE=${PHASE:-1}
BERT_CMD="\
${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
--seed=42 \
${PHASES[$((PHASE-1))]} \
--do_train \
--config_file=/workspace/bert/bert_config.json \
--output_dir=/results \
--fp16 \
--allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
--gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
--log_freq=1 \
--local_rank=\${SLURM_LOCALID}"
srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function
import pickle
import argparse
import logging
import os
import random
import wget
import json
import time
import dllogger
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
import modeling
from tokenization import BertTokenizer
from optimization import BertAdam, warmup_linear
from schedulers import LinearWarmUpScheduler
from apex import amp
from sklearn.metrics import matthews_corrcoef, f1_score
from utils import (is_main_process, mkdir_by_main_process, format_step,
get_world_size)
from processors.glue import PROCESSORS, convert_examples_to_features
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO,
)
logger = logging.getLogger(__name__)
def compute_metrics(task_name, preds, labels):
assert len(preds) == len(labels)
if task_name == "cola":
return {"mcc": matthews_corrcoef(labels, preds)}
elif task_name == "sst-2":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mrpc":
return acc_and_f1(preds, labels)
elif task_name == "sts-b":
return pearson_and_spearman(preds, labels)
elif task_name == "qqp":
return acc_and_f1(preds, labels)
elif task_name == "mnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mnli-mm":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "qnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "rte":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "wnli":
return {"acc": simple_accuracy(preds, labels)}
else:
raise KeyError(task_name)
def simple_accuracy(preds, labels):
return (preds == labels).mean()
def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = f1_score(y_true=labels, y_pred=preds)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
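# Worked example (illustrative): for an MRPC-style binary task,
#   preds  = np.array([1, 0, 1, 1])
#   labels = np.array([1, 0, 0, 1])
#   compute_metrics("mrpc", preds, labels)
# returns {"acc": 0.75, "f1": 0.8, "acc_and_f1": 0.775}.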
from apex.multi_tensor_apply import multi_tensor_applier
class GradientClipper:
"""
Clips gradient norm of an iterable of parameters.
"""
def __init__(self, max_grad_norm):
self.max_norm = max_grad_norm
if multi_tensor_applier.available:
import amp_C
self._overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
self.multi_tensor_scale = amp_C.multi_tensor_scale
else:
raise RuntimeError('Gradient clipping requires cuda extensions')
def step(self, parameters):
l = [p.grad for p in parameters if p.grad is not None]
total_norm, _ = multi_tensor_applier(
self.multi_tensor_l2norm,
self._overflow_buf,
[l],
False,
)
total_norm = total_norm.item()
if (total_norm == float('inf')): return
clip_coef = self.max_norm / (total_norm + 1e-6)
if clip_coef < 1:
multi_tensor_applier(
self.multi_tensor_scale,
self._overflow_buf,
[l, l],
clip_coef,
)
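# Illustrative usage sketch (hypothetical; GradientClipper is defined here but
# not wired into the fine-tuning loop below). On a CUDA machine with apex
# installed it would slot into a training step roughly as:
#   clipper = GradientClipper(max_grad_norm=1.0)
#   loss.backward()
#   clipper.step(amp.master_params(optimizer))  # rescales gradients in place
#   optimizer.step()
#   optimizer.zero_grad()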
def parse_args(parser=argparse.ArgumentParser()):
## Required parameters
parser.add_argument(
"--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data "
"files) for the task.",
)
parser.add_argument(
"--bert_model",
default=None,
type=str,
required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, "
"bert-base-multilingual-uncased, bert-base-multilingual-cased, "
"bert-base-chinese.",
)
parser.add_argument(
"--task_name",
default=None,
type=str,
required=True,
choices=PROCESSORS.keys(),
help="The name of the task to train.",
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints "
"will be written.",
)
parser.add_argument(
"--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining",
)
## Other parameters
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece "
"tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.",
)
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to get model-task performance on the dev"
" set by running eval.")
parser.add_argument("--do_predict",
action='store_true',
help="Whether to output prediction results on the dev "
"set by running eval.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Batch size per GPU for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Batch size per GPU for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps",
default=-1.0,
type=float,
help="Total number of training steps to perform.")
parser.add_argument(
"--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup "
"for. E.g., 0.1 = 10%% of training.",
)
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=1,
help="random seed for initialization")
parser.add_argument(
'--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a "
"backward/update pass.")
parser.add_argument(
'--fp16',
action='store_true',
help="Mixed precision training",
)
parser.add_argument(
'--amp',
action='store_true',
help="Mixed precision training",
)
parser.add_argument(
'--loss_scale',
type=float,
default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when "
"fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n",
)
parser.add_argument('--server_ip',
type=str,
default='',
help="Can be used for distant debugging.")
parser.add_argument('--server_port',
type=str,
default='',
help="Can be used for distant debugging.")
parser.add_argument('--vocab_file',
type=str,
default=None,
required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
return parser.parse_args()
def init_optimizer_and_amp(model, learning_rate, loss_scale, warmup_proportion,
num_train_optimization_steps, use_fp16):
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{
'params': [
p for n, p in param_optimizer
if not any(nd in n for nd in no_decay)
],
'weight_decay': 0.01
},
{
'params': [
p for n, p in param_optimizer if any(nd in n for nd in no_decay)
],
'weight_decay': 0.0
},
]
optimizer, scheduler = None, None
if use_fp16:
logger.info("using fp16")
try:
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from "
"https://www.github.com/nvidia/apex to use "
"distributed and fp16 training.")
if num_train_optimization_steps is not None:
optimizer = FusedAdam(
optimizer_grouped_parameters,
lr=learning_rate,
bias_correction=False,
)
amp_inits = amp.initialize(
model,
optimizers=optimizer,
opt_level="O2",
keep_batchnorm_fp32=False,
loss_scale="dynamic" if loss_scale == 0 else loss_scale,
)
model, optimizer = (amp_inits
if num_train_optimization_steps is not None else
(amp_inits, None))
if num_train_optimization_steps is not None:
scheduler = LinearWarmUpScheduler(
optimizer,
warmup=warmup_proportion,
total_steps=num_train_optimization_steps,
)
else:
logger.info("using fp32")
if num_train_optimization_steps is not None:
optimizer = BertAdam(
optimizer_grouped_parameters,
lr=learning_rate,
warmup=warmup_proportion,
t_total=num_train_optimization_steps,
)
return model, optimizer, scheduler
def gen_tensor_dataset(features):
all_input_ids = torch.tensor(
[f.input_ids for f in features],
dtype=torch.long,
)
all_input_mask = torch.tensor(
[f.input_mask for f in features],
dtype=torch.long,
)
all_segment_ids = torch.tensor(
[f.segment_ids for f in features],
dtype=torch.long,
)
all_label_ids = torch.tensor(
[f.label_id for f in features],
dtype=torch.long,
)
return TensorDataset(
all_input_ids,
all_input_mask,
all_segment_ids,
all_label_ids,
)
def get_train_features(data_dir, bert_model, max_seq_length, do_lower_case,
local_rank, train_batch_size,
gradient_accumulation_steps, num_train_epochs, tokenizer,
processor):
cached_train_features_file = os.path.join(
data_dir,
'{0}_{1}_{2}'.format(
list(filter(None, bert_model.split('/'))).pop(),
str(max_seq_length),
str(do_lower_case),
),
)
train_features = None
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
logger.info("Loaded pre-processed features from {}".format(
cached_train_features_file))
except Exception:
logger.info("Did not find pre-processed features from {}".format(
cached_train_features_file))
train_examples = processor.get_train_examples(data_dir)
train_features, _ = convert_examples_to_features(
train_examples,
processor.get_labels(),
max_seq_length,
tokenizer,
)
if is_main_process():
logger.info(" Saving train features into cached file %s",
cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
return train_features
def dump_predictions(path, label_map, preds, examples):
label_rmap = {label_idx: label for label, label_idx in label_map.items()}
predictions = {
example.guid: label_rmap[preds[i]] for i, example in enumerate(examples)
}
with open(path, "w") as writer:
json.dump(predictions, writer)
def main(args):
args.fp16 = args.fp16 or args.amp
if args.server_ip and args.server_port:
# Distant debugging - see
# https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
logger.info("Waiting for debugger attach")
ptvsd.enable_attach(
address=(args.server_ip, args.server_port),
redirect_output=True,
)
ptvsd.wait_for_attach()
if args.local_rank == -1 or args.no_cuda:
device = torch.device(
"cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of
# synchronizing nodes/GPUs.
if not torch.distributed.is_initialized():
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, "
"16-bits training: {}".format(
device,
n_gpu,
bool(args.local_rank != -1),
args.fp16,
))
if not args.do_train and not args.do_eval and not args.do_predict:
raise ValueError("At least one of `do_train`, `do_eval` or "
"`do_predict` must be True.")
if is_main_process():
if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and
args.do_train):
logger.warning("Output directory ({}) already exists and is not "
"empty.".format(args.output_dir))
mkdir_by_main_process(args.output_dir)
if is_main_process():
dllogger.init(backends=[
dllogger.JSONStreamBackend(
verbosity=dllogger.Verbosity.VERBOSE,
filename=os.path.join(args.output_dir, 'dllogger.json'),
),
dllogger.StdOutBackend(
verbosity=dllogger.Verbosity.VERBOSE,
step_format=format_step,
),
])
else:
dllogger.init(backends=[])
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
"should be >= 1".format(
args.gradient_accumulation_steps))
if args.gradient_accumulation_steps > args.train_batch_size:
raise ValueError("gradient_accumulation_steps ({}) cannot be larger "
"train_batch_size ({}) - there cannot be a fraction "
"of one sample.".format(
args.gradient_accumulation_steps,
args.train_batch_size,
))
args.train_batch_size = (args.train_batch_size //
args.gradient_accumulation_steps)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
processor = PROCESSORS[args.task_name]()
num_labels = len(processor.get_labels())
#tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
tokenizer = BertTokenizer(
args.vocab_file,
do_lower_case=args.do_lower_case,
max_len=512,
) # for bert large
num_train_optimization_steps = None
if args.do_train:
train_features = get_train_features(
args.data_dir,
args.bert_model,
args.max_seq_length,
args.do_lower_case,
args.local_rank,
args.train_batch_size,
args.gradient_accumulation_steps,
args.num_train_epochs,
tokenizer,
processor,
)
num_train_optimization_steps = int(
len(train_features) / args.train_batch_size /
args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = (num_train_optimization_steps //
torch.distributed.get_world_size())
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForSequenceClassification(
config,
num_labels=num_labels,
)
logger.info("USING CHECKPOINT from {}".format(args.init_checkpoint))
model.load_state_dict(
torch.load(args.init_checkpoint, map_location='cpu')["model"],
strict=False,
)
logger.info("USED CHECKPOINT from {}".format(args.init_checkpoint))
dllogger.log(
step="PARAMETER",
data={
"num_parameters":
sum([p.numel() for p in model.parameters() if p.requires_grad]),
},
)
model.to(device)
# Prepare optimizer
model, optimizer, scheduler = init_optimizer_and_amp(
model,
args.learning_rate,
args.loss_scale,
args.warmup_proportion,
num_train_optimization_steps,
args.fp16,
)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from "
"https://www.github.com/nvidia/apex to use "
"distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
loss_fct = torch.nn.CrossEntropyLoss()
results = {}
if args.do_train:
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_features))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
train_data = gen_tensor_dataset(train_features)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(
train_data,
sampler=train_sampler,
batch_size=args.train_batch_size,
)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
latency_train = 0.0
nb_tr_examples = 0
model.train()
tic_train = time.perf_counter()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss, nb_tr_steps = 0, 0
for step, batch in enumerate(
tqdm(train_dataloader, desc="Iteration")):
if args.max_steps > 0 and global_step > args.max_steps:
break
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
logits = model(input_ids, segment_ids, input_mask)
loss = loss_fct(
logits.view(-1, num_labels),
label_ids.view(-1),
)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up for BERT
# which FusedAdam doesn't do
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
latency_train = time.perf_counter() - tic_train
tr_loss = tr_loss / nb_tr_steps
results.update({
'global_step':
global_step,
'train:loss':
tr_loss,
'train:latency':
latency_train,
'train:num_samples_per_gpu':
nb_tr_examples,
'train:num_steps':
nb_tr_steps,
'train:throughput':
get_world_size() * nb_tr_examples / latency_train,
})
if is_main_process() and not args.skip_checkpoint:
model_to_save = model.module if hasattr(model, 'module') else model
torch.save(
{"model": model_to_save.state_dict()},
os.path.join(args.output_dir, modeling.WEIGHTS_NAME),
)
with open(
os.path.join(args.output_dir, modeling.CONFIG_NAME),
'w',
) as f:
f.write(model_to_save.config.to_json_string())
if (args.do_eval or args.do_predict) and is_main_process():
eval_examples = processor.get_dev_examples(args.data_dir)
eval_features, label_map = convert_examples_to_features(
eval_examples,
processor.get_labels(),
args.max_seq_length,
tokenizer,
)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_data = gen_tensor_dataset(eval_features)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(
eval_data,
sampler=eval_sampler,
batch_size=args.eval_batch_size,
)
model.eval()
preds = None
out_label_ids = None
eval_loss = 0
nb_eval_steps, nb_eval_examples = 0, 0
cuda_events = [(torch.cuda.Event(enable_timing=True),
torch.cuda.Event(enable_timing=True))
for _ in range(len(eval_dataloader))]
for i, (input_ids, input_mask, segment_ids, label_ids) in tqdm(
enumerate(eval_dataloader),
desc="Evaluating",
):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
cuda_events[i][0].record()
logits = model(input_ids, segment_ids, input_mask)
cuda_events[i][1].record()
if args.do_eval:
eval_loss += loss_fct(
logits.view(-1, num_labels),
label_ids.view(-1),
).mean().item()
nb_eval_steps += 1
nb_eval_examples += input_ids.size(0)
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = label_ids.detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
out_label_ids = np.append(
out_label_ids,
label_ids.detach().cpu().numpy(),
axis=0,
)
torch.cuda.synchronize()
eval_latencies = [
event_start.elapsed_time(event_end)
for event_start, event_end in cuda_events
]
eval_latencies = list(sorted(eval_latencies))
def infer_latency_sli(threshold):
index = int(len(eval_latencies) * threshold) - 1
index = min(max(index, 0), len(eval_latencies) - 1)
return eval_latencies[index]
eval_throughput = (args.eval_batch_size /
(np.mean(eval_latencies) / 1000))
results.update({
'eval:num_samples_per_gpu': nb_eval_examples,
'eval:num_steps': nb_eval_steps,
'infer:latency(ms):50%': infer_latency_sli(0.5),
'infer:latency(ms):90%': infer_latency_sli(0.9),
'infer:latency(ms):95%': infer_latency_sli(0.95),
'infer:latency(ms):99%': infer_latency_sli(0.99),
'infer:latency(ms):100%': infer_latency_sli(1.0),
'infer:latency(ms):avg': np.mean(eval_latencies),
'infer:latency(ms):std': np.std(eval_latencies),
'infer:latency(ms):sum': np.sum(eval_latencies),
'infer:throughput(samples/s):avg': eval_throughput,
})
preds = np.argmax(preds, axis=1)
if args.do_predict:
dump_predictions(
os.path.join(args.output_dir, 'predictions.json'),
label_map,
preds,
eval_examples,
)
if args.do_eval:
results['eval:loss'] = eval_loss / nb_eval_steps
eval_result = compute_metrics(args.task_name, preds, out_label_ids)
results.update(eval_result)
if is_main_process():
logger.info("***** Results *****")
for key in sorted(results.keys()):
logger.info(" %s = %s", key, str(results[key]))
with open(os.path.join(args.output_dir, "results.txt"), "w") as writer:
json.dump(results, writer)
dllogger_queries_from_results = {
'exact_match': 'acc',
'F1': 'f1',
'e2e_train_time': 'train:latency',
'training_sequences_per_second': 'train:throughput',
'e2e_inference_time': ('infer:latency(ms):sum', lambda x: x / 1000),
'inference_sequences_per_second': 'infer:throughput(samples/s):avg',
}
for key, query in dllogger_queries_from_results.items():
results_key, convert = (query if isinstance(query, tuple) else
(query, lambda x: x))
if results_key not in results:
continue
dllogger.log(
step=tuple(),
data={key: convert(results[results_key])},
)
dllogger.flush()
return results
if __name__ == "__main__":
main(parse_args())
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# ==================
import csv
import os
import time
import argparse
import random
import h5py
from tqdm import tqdm, trange
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
from apex import amp
import multiprocessing
from tokenization import BertTokenizer
import modeling
from apex.optimizers import FusedLAMB
from schedulers import PolyWarmUpScheduler
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from utils import is_main_process, format_step, get_world_size, get_rank
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler
from apex.parallel.distributed import flat_dist_call
import amp_C
import apex_C
from apex.amp import _amp_state
import dllogger
from concurrent.futures import ProcessPoolExecutor
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
skipped_steps = 0
# Track whether a SIGTERM (cluster time up) has been handled
timeout_sent = False
import signal
# handle SIGTERM sent from the scheduler and mark so we
# can gracefully save & exit
def signal_handler(sig, frame):
global timeout_sent
timeout_sent = True
signal.signal(signal.SIGTERM, signal_handler)
#Workaround because python functions are not picklable
class WorkerInitObj(object):
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args, worker_init):
train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu,
num_workers=4, worker_init_fn=worker_init,
pin_memory=True)
return train_dataloader, input_file
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
keys = ['input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', 'masked_lm_ids',
'next_sentence_labels']
self.inputs = [np.asarray(f[key][:]) for key in keys]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.inputs[0])
def __getitem__(self, index):
[input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [
torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else torch.from_numpy(
np.asarray(input[index].astype(np.int64))) for indice, input in enumerate(self.inputs)]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
index = self.max_pred_length
# store number of masked tokens in index
padded_mask_indices = (masked_lm_positions == 0).nonzero()
if len(padded_mask_indices) != 0:
index = padded_mask_indices[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask,
masked_lm_labels, next_sentence_labels]
class BertPretrainingCriterion(torch.nn.Module):
def __init__(self, vocab_size):
super(BertPretrainingCriterion, self).__init__()
self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
self.vocab_size = vocab_size
def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels):
masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
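# Shape sketch (illustrative, hypothetical tensor names): the criterion adds a
# masked-LM cross-entropy over the vocabulary to a two-way next-sentence loss;
# label positions equal to -1 (unmasked tokens) are ignored via ignore_index=-1.
#   criterion = BertPretrainingCriterion(vocab_size=30528)
#   prediction_scores       [batch, seq_len, vocab_size]
#   seq_relationship_score  [batch, 2]
#   masked_lm_labels        [batch, seq_len], -1 where the token is not masked
#   next_sentence_labels    [batch]
#   loss = criterion(prediction_scores, seq_relationship_score,
#                    masked_lm_labels, next_sentence_labels)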
def parse_arguments():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--init_checkpoint",
default=None,
type=str,
help="The initial checkpoint to start training from.")
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps",
default=1000,
type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.01,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=1.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
action='store_true',
help="Whether to use gradient checkpointing")
parser.add_argument("--resume_from_checkpoint",
default=False,
action='store_true',
help="Whether to resume training from checkpoint.")
parser.add_argument('--resume_step',
type=int,
default=-1,
help="Step to resume training from.")
parser.add_argument('--num_steps_per_checkpoint',
type=int,
default=100,
help="Number of update steps until a model checkpoint is saved to disk.")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--phase2',
default=False,
action='store_true',
help="Whether to train with seq len 512")
parser.add_argument('--allreduce_post_accumulation',
default=False,
action='store_true',
help="Whether to do allreduces during gradient accumulation steps.")
parser.add_argument('--allreduce_post_accumulation_fp16',
default=False,
action='store_true',
help="Whether to do fp16 allreduce post accumulation.")
parser.add_argument('--phase1_end_step',
type=int,
default=7038,
help="Number of training steps in Phase1 - seq len 128")
parser.add_argument('--init_loss_scale',
type=int,
default=2**20,
help="Initial loss scaler value")
parser.add_argument("--do_train",
default=False,
action='store_true',
help="Whether to run training.")
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
help='If provided, the json summary will be written to '
'the specified file.')
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--disable_progress_bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument('--steps_this_run', type=int, default=-1,
help='If provided, only run this many steps before exiting')
args = parser.parse_args()
args.fp16 = args.fp16 or args.amp
if args.steps_this_run < 0:
args.steps_this_run = args.max_steps
return args
def setup_training(args):
assert (torch.cuda.is_available())
if args.local_rank == -1:
device = torch.device("cuda")
args.n_gpu = torch.cuda.device_count()
args.allreduce_post_accumulation = False
args.allreduce_post_accumulation_fp16 = False
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.n_gpu = 1
if args.gradient_accumulation_steps == 1:
args.allreduce_post_accumulation = False
args.allreduce_post_accumulation_fp16 = False
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
if args.train_batch_size % args.gradient_accumulation_steps != 0:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
args.gradient_accumulation_steps, args.train_batch_size))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
if not args.do_train:
raise ValueError(" `do_train` must be True.")
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
os.listdir(args.output_dir) and any([i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if (not args.resume_from_checkpoint or not os.path.exists(args.output_dir)) and is_main_process():
os.makedirs(args.output_dir, exist_ok=True)
return device, args
def prepare_model_and_optimizer(args, device):
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForPreTraining(config)
checkpoint = None
if not args.resume_from_checkpoint:
global_step = 0
else:
if args.resume_step == -1 and not args.init_checkpoint:
model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
global_step = args.resume_step if not args.init_checkpoint else 0
if not args.init_checkpoint:
checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
else:
checkpoint = torch.load(args.init_checkpoint, map_location="cpu")
model.load_state_dict(checkpoint['model'], strict=False)
if args.phase2 and not args.init_checkpoint:
global_step -= args.phase1_end_step
if is_main_process():
print("resume step from ", args.resume_step)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = FusedLAMB(optimizer_grouped_parameters,
lr=args.learning_rate)
lr_scheduler = PolyWarmUpScheduler(optimizer,
warmup=args.warmup_proportion,
total_steps=args.max_steps)
if args.fp16:
if args.loss_scale == 0:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", cast_model_outputs=torch.float16)
else:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale, cast_model_outputs=torch.float16)
amp._amp_state.loss_scalers[0]._loss_scale = args.init_loss_scale
model.checkpoint_activations(args.checkpoint_activations)
if args.resume_from_checkpoint:
if args.phase2 or args.init_checkpoint:
keys = list(checkpoint['optimizer']['state'].keys())
#Override hyperparameters from previous checkpoint
for key in keys:
checkpoint['optimizer']['state'][key]['step'] = global_step
for iter, item in enumerate(checkpoint['optimizer']['param_groups']):
checkpoint['optimizer']['param_groups'][iter]['step'] = global_step
checkpoint['optimizer']['param_groups'][iter]['t_total'] = args.max_steps
checkpoint['optimizer']['param_groups'][iter]['warmup'] = args.warmup_proportion
checkpoint['optimizer']['param_groups'][iter]['lr'] = args.learning_rate
optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False)
# Restore AMP master parameters
if args.fp16:
optimizer._lazy_init_maybe_master_weights()
optimizer._amp_stash.lazy_init_called = True
optimizer.load_state_dict(checkpoint['optimizer'])
for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
param.data.copy_(saved_param.data)
if args.local_rank != -1:
if not args.allreduce_post_accumulation:
model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())
else:
flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )
elif args.n_gpu > 1:
model = torch.nn.DataParallel(model)
criterion = BertPretrainingCriterion(config.vocab_size)
return model, optimizer, lr_scheduler, checkpoint, global_step, criterion
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
global skipped_steps
if args.allreduce_post_accumulation:
# manually allreduce gradients after all accumulation steps
# check for Inf/NaN
# 1. allocate an uninitialized buffer for flattened gradient
loss_scale = _amp_state.loss_scalers[0].loss_scale() if args.fp16 else 1
master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None]
flat_grad_size = sum(p.numel() for p in master_grads)
allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else torch.float32
flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
# 2. combine unflattening and predivision of unscaled 'raw' gradient
allreduced_views = apex_C.unflatten(flat_raw, master_grads)
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[master_grads, allreduced_views],
loss_scale / (get_world_size() * args.gradient_accumulation_steps))
# 3. sum gradient across ranks. Because of the predivision, this averages the gradient
torch.distributed.all_reduce(flat_raw)
# 4. combine unscaling and unflattening of allreduced gradient
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[allreduced_views, master_grads],
1./loss_scale)
# 5. update loss scale
if args.fp16:
scaler = _amp_state.loss_scalers[0]
old_overflow_buf = scaler._overflow_buf
scaler._overflow_buf = overflow_buf
had_overflow = scaler.update_scale()
scaler._overflow_buf = old_overflow_buf
else:
had_overflow = 0
# 6. call optimizer step function
if had_overflow == 0:
optimizer.step()
global_step += 1
else:
# Overflow detected, print message and clear gradients
skipped_steps += 1
if is_main_process():
scaler = _amp_state.loss_scalers[0]
dllogger.log(step="PARAMETER", data={"loss_scale": scaler.loss_scale()})
if _amp_state.opt_properties.master_weights:
for param in optimizer._amp_stash.all_fp32_from_fp16_params:
param.grad = None
for param in model.parameters():
param.grad = None
else:
optimizer.step()
#optimizer.zero_grad()
for param in model.parameters():
param.grad = None
global_step += 1
return global_step
def main():
global timeout_sent
args = parse_arguments()
random.seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
torch.manual_seed(args.seed + args.local_rank)
torch.cuda.manual_seed(args.seed + args.local_rank)
worker_init = WorkerInitObj(args.seed + args.local_rank)
device, args = setup_training(args)
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
# Prepare optimizer
model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device)
if is_main_process():
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
raw_train_start = None
if args.do_train:
if is_main_process():
dllogger.log(step="PARAMETER", data={"train_start": True})
dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size})
dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate})
model.train()
most_recent_ckpts_paths = []
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
pool = ProcessPoolExecutor(1)
# Note: We loop infinitely over epochs, termination is handled via iteration count
while True:
thread = None
restored_data_loader = None
if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint:
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
files.sort()
num_files = len(files)
random.Random(args.seed + epoch).shuffle(files)
f_start_id = 0
else:
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
num_files = len(files)
# may not exist in all checkpoints
epoch = checkpoint.get('epoch', 0)
restored_data_loader = checkpoint.get('data_loader', None)
shared_file_list = {}
if torch.distributed.is_initialized() and get_world_size() > num_files:
remainder = get_world_size() % num_files
data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files]
else:
data_file = files[(f_start_id*get_world_size()+get_rank())%num_files]
previous_file = data_file
if restored_data_loader is None:
train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu,
num_workers=4, worker_init_fn=worker_init,
pin_memory=True)
# shared_file_list["0"] = (train_dataloader, data_file)
else:
train_dataloader = restored_data_loader
restored_data_loader = None
overflow_buf = None
if args.allreduce_post_accumulation:
overflow_buf = torch.cuda.IntTensor([0])
for f_id in range(f_start_id + 1 , len(files)):
if get_world_size() > num_files:
data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files]
else:
data_file = files[(f_id*get_world_size()+get_rank())%num_files]
previous_file = data_file
dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init)
train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
if raw_train_start is None:
raw_train_start = time.time()
for step, batch in enumerate(train_iter):
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
prediction_scores, seq_relationship_score = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
divisor = args.gradient_accumulation_steps
if args.gradient_accumulation_steps > 1:
if not args.allreduce_post_accumulation:
# this division was merged into predivision
loss = loss / args.gradient_accumulation_steps
divisor = 1.0
if args.fp16:
with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
average_loss += loss.item()
if training_steps % args.gradient_accumulation_steps == 0:
lr_scheduler.step() # learning rate warmup
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
if global_step >= args.steps_this_run or timeout_sent:
train_time_raw = time.time() - raw_train_start
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
average_loss = average_loss / (last_num_steps * divisor)
if (torch.distributed.is_initialized()):
average_loss /= get_world_size()
torch.distributed.all_reduce(average_loss)
final_loss = average_loss.item()
if is_main_process():
dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss})
elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
if is_main_process():
dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor),
"step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
"learning_rate": optimizer.param_groups[0]['lr']})
average_loss = 0
if global_step >= args.steps_this_run or training_steps % (
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
if is_main_process() and not args.skip_checkpoint:
# Save a trained model
dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
model_to_save = model.module if hasattr(model,
'module') else model # Only save the model itself
if args.resume_step < 0 or not args.phase2:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
else:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
if args.do_train:
torch.save({'model': model_to_save.state_dict(),
'optimizer': optimizer.state_dict(),
'master params': list(amp.master_params(optimizer)),
'files': [f_id] + files,
'epoch': epoch,
'data_loader': None if global_step >= args.max_steps else train_dataloader}, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
# Exiting the training due to hitting max steps, or being sent a
# timeout from the cluster scheduler
if global_step >= args.steps_this_run or timeout_sent:
del train_dataloader
# thread.join()
return args, final_loss, train_time_raw, global_step
del train_dataloader
# thread.join()
# Make sure pool has finished and switch train_dataloader
# NOTE: Will block until complete
train_dataloader, data_file = dataset_future.result(timeout=None)
epoch += 1
if __name__ == "__main__":
now = time.time()
args, final_loss, train_time_raw, global_step = main()
gpu_count = args.n_gpu
global_step += args.phase1_end_step if (args.phase2 and args.resume_step > 0) else 0
if args.resume_step == -1:
args.resume_step = 0
if torch.distributed.is_initialized():
gpu_count = get_world_size()
if is_main_process():
e2e_time = time.time() - now
training_perf = args.train_batch_size * args.gradient_accumulation_steps * gpu_count\
* (global_step - args.resume_step + skipped_steps) / train_time_raw
dllogger.log(step=tuple(), data={"e2e_train_time": e2e_time, "training_sequences_per_second": training_perf,
"final_loss": final_loss, "raw_train_time": train_time_raw })
dllogger.flush()
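# The __main__ block above reports "training_sequences_per_second" as
# batch_size * accumulation_steps * gpu_count * (optimizer steps taken, including
# overflow-skipped ones) / raw training time. A minimal standalone sketch of that
# arithmetic; all numbers below are made-up placeholders, not measured values.
def _example_training_throughput(train_batch_size=8, gradient_accumulation_steps=4,
                                 gpu_count=8, global_step=1000, resume_step=0,
                                 skipped_steps=5, train_time_raw=3600.0):
    """Sequences processed per second, mirroring the dllogger metric above."""
    steps = global_step - resume_step + skipped_steps
    sequences = train_batch_size * gradient_accumulation_steps * gpu_count * steps
    return sequences / train_time_raw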
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# ==================
import csv
import os
import time
import argparse
import random
import h5py
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
from apex import amp
import multiprocessing
from tokenization import BertTokenizer
import modeling
from apex.optimizers import FusedLAMB
from schedulers import PolyWarmUpScheduler
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from utils import is_main_process, format_step, get_world_size, get_rank
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler
from apex.parallel.distributed import flat_dist_call
import amp_C
import apex_C
from apex.amp import _amp_state
import dllogger
from concurrent.futures import ProcessPoolExecutor
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
skipped_steps = 0
# Track whether a SIGTERM (cluster time up) has been handled
timeout_sent = False
import signal
# handle SIGTERM sent from the scheduler and mark so we
# can gracefully save & exit
def signal_handler(sig, frame):
global timeout_sent
timeout_sent = True
signal.signal(signal.SIGTERM, signal_handler)
#Workaround because python functions are not picklable
class WorkerInitObj(object):
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args, worker_init):
train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu,
num_workers=4, worker_init_fn=worker_init,
pin_memory=True)
return train_dataloader, input_file
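# main() below overlaps training on the current HDF5 shard with building the
# DataLoader for the next shard in a worker process (pool.submit + future.result).
# A minimal sketch of that prefetch pattern with a stand-in loader; _toy_load_shard
# is hypothetical and only stands in for create_pretraining_dataset here.
def _toy_load_shard(path):
    return "loader-for-" + path  # placeholder for an actual DataLoader

def _example_shard_prefetch(shard_paths):
    from concurrent.futures import ProcessPoolExecutor
    with ProcessPoolExecutor(max_workers=1) as pool:
        current = _toy_load_shard(shard_paths[0])
        for next_path in shard_paths[1:]:
            future = pool.submit(_toy_load_shard, next_path)  # start loading the next shard
            _ = current                                       # ... train on `current` here ...
            current = future.result()                         # block until the next shard is ready
        return current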
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
keys = ['input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', 'masked_lm_ids',
'next_sentence_labels']
self.inputs = [np.asarray(f[key][:]) for key in keys]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.inputs[0])
def __getitem__(self, index):
[input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [
torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else torch.from_numpy(
np.asarray(input[index].astype(np.int64))) for indice, input in enumerate(self.inputs)]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
index = self.max_pred_length
# store number of masked tokens in index
padded_mask_indices = (masked_lm_positions == 0).nonzero()
if len(padded_mask_indices) != 0:
index = padded_mask_indices[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask,
masked_lm_labels, next_sentence_labels]
class BertPretrainingCriterion(torch.nn.Module):
def __init__(self, vocab_size):
super(BertPretrainingCriterion, self).__init__()
self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
self.vocab_size = vocab_size
def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels):
masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
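# A tiny usage sketch for BertPretrainingCriterion with random tensors: label
# positions set to -1 are excluded by CrossEntropyLoss(ignore_index=-1). The shapes
# (batch=2, seq_len=8, vocab=32) are illustrative assumptions only.
def _example_criterion_usage():
    import torch
    vocab_size, batch, seq_len = 32, 2, 8
    criterion = BertPretrainingCriterion(vocab_size)
    prediction_scores = torch.randn(batch, seq_len, vocab_size)   # MLM logits
    seq_relationship_score = torch.randn(batch, 2)                # NSP logits
    masked_lm_labels = torch.full((batch, seq_len), -1, dtype=torch.long)
    masked_lm_labels[:, 1] = 5                                    # pretend one masked token per row
    next_sentence_labels = torch.tensor([0, 1])
    return criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels)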
def parse_arguments():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--init_checkpoint",
default=None,
type=str,
help="The initial checkpoint to start training from.")
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps",
default=1000,
type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.01,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=1.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
action='store_true',
help="Whether to use gradient checkpointing")
parser.add_argument("--resume_from_checkpoint",
default=False,
action='store_true',
help="Whether to resume training from checkpoint.")
parser.add_argument('--resume_step',
type=int,
default=-1,
help="Step to resume training from.")
parser.add_argument('--num_steps_per_checkpoint',
type=int,
default=100,
help="Number of update steps until a model checkpoint is saved to disk.")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--phase2',
default=False,
action='store_true',
help="Whether to train with seq len 512")
parser.add_argument('--allreduce_post_accumulation',
default=False,
action='store_true',
help="Whether to do allreduces during gradient accumulation steps.")
parser.add_argument('--allreduce_post_accumulation_fp16',
default=False,
action='store_true',
help="Whether to do fp16 allreduce post accumulation.")
parser.add_argument('--phase1_end_step',
type=int,
default=7038,
help="Number of training steps in Phase1 - seq len 128")
parser.add_argument('--init_loss_scale',
type=int,
default=2**20,
help="Initial loss scaler value")
parser.add_argument("--do_train",
default=False,
action='store_true',
help="Whether to run training.")
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
help='If provided, the json summary will be written to '
'the specified file.')
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--disable_progress_bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument('--steps_this_run', type=int, default=-1,
help='If provided, only run this many steps before exiting')
parser.add_argument("--dist_url",default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument("--gpus_per_node",type=int,default=4,
help='num of gpus per node')
parser.add_argument("--world_size",type=int,default=1,
help="number of process")
args = parser.parse_args()
args.fp16 = args.fp16 or args.amp
if args.steps_this_run < 0:
args.steps_this_run = args.max_steps
return args
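# An illustrative way to drive parse_arguments() programmatically; every path and
# value below is a placeholder assumption, not a recommended configuration.
def _example_parse_arguments():
    import sys
    sys.argv = ["run_pretraining.py",
                "--input_dir", "/path/to/hdf5_shards",          # placeholder
                "--config_file", "/path/to/bert_config.json",   # placeholder
                "--output_dir", "/path/to/checkpoints",         # placeholder
                "--do_train",
                "--train_batch_size", "64",
                "--gradient_accumulation_steps", "4",
                "--fp16"]
    return parse_arguments()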
def setup_training(args):
assert (torch.cuda.is_available())
if args.local_rank == -1:
device = torch.device("cuda")
args.n_gpu = torch.cuda.device_count()
args.allreduce_post_accumulation = False
args.allreduce_post_accumulation_fp16 = False
else:
#torch.cuda.set_device(args.local_rank)
#device = torch.device("cuda", args.local_rank)
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
#torch.distributed.init_process_group(backend='nccl', init_method='env://')
#xuan
device_n = args.local_rank % 8
torch.cuda.set_device(device_n)
device = torch.device("cuda", device_n)
torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url,
world_size=args.world_size, rank=args.local_rank)
args.n_gpu = 1
if args.gradient_accumulation_steps == 1:
args.allreduce_post_accumulation = False
args.allreduce_post_accumulation_fp16 = False
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
if args.train_batch_size % args.gradient_accumulation_steps != 0:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
args.gradient_accumulation_steps, args.train_batch_size))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
if not args.do_train:
raise ValueError(" `do_train` must be True.")
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
os.listdir(args.output_dir) and any([i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if (not args.resume_from_checkpoint or not os.path.exists(args.output_dir)) and is_main_process():
os.makedirs(args.output_dir, exist_ok=True)
return device, args
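# setup_training() divides --train_batch_size by --gradient_accumulation_steps, so the
# DataLoader sees the smaller micro-batch while the flag value is what one GPU
# contributes per optimizer step. A small sketch of that arithmetic with
# illustrative numbers (not defaults from this script):
def _example_effective_batch(train_batch_size=64, gradient_accumulation_steps=4, world_size=8):
    micro_batch = train_batch_size // gradient_accumulation_steps  # per forward/backward pass
    per_gpu_per_step = micro_batch * gradient_accumulation_steps   # == train_batch_size
    global_batch = per_gpu_per_step * world_size                   # sequences per optimizer step
    return micro_batch, per_gpu_per_step, global_batch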
def prepare_model_and_optimizer(args, device):
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForPreTraining(config)
checkpoint = None
if not args.resume_from_checkpoint:
global_step = 0
else:
if args.resume_step == -1 and not args.init_checkpoint:
model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
global_step = args.resume_step if not args.init_checkpoint else 0
if not args.init_checkpoint:
checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
else:
checkpoint = torch.load(args.init_checkpoint, map_location="cpu")
model.load_state_dict(checkpoint['model'], strict=False)
if args.phase2 and not args.init_checkpoint:
global_step -= args.phase1_end_step
if is_main_process():
print("resume step from ", args.resume_step)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = FusedLAMB(optimizer_grouped_parameters,
lr=args.learning_rate)
lr_scheduler = PolyWarmUpScheduler(optimizer,
warmup=args.warmup_proportion,
total_steps=args.max_steps)
if args.fp16:
if args.loss_scale == 0:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", cast_model_outputs=torch.float16)
else:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale, cast_model_outputs=torch.float16)
amp._amp_state.loss_scalers[0]._loss_scale = args.init_loss_scale
model.checkpoint_activations(args.checkpoint_activations)
if args.resume_from_checkpoint:
if args.phase2 or args.init_checkpoint:
keys = list(checkpoint['optimizer']['state'].keys())
#Override hyperparameters from previous checkpoint
for key in keys:
checkpoint['optimizer']['state'][key]['step'] = global_step
for iter, item in enumerate(checkpoint['optimizer']['param_groups']):
checkpoint['optimizer']['param_groups'][iter]['step'] = global_step
checkpoint['optimizer']['param_groups'][iter]['t_total'] = args.max_steps
checkpoint['optimizer']['param_groups'][iter]['warmup'] = args.warmup_proportion
checkpoint['optimizer']['param_groups'][iter]['lr'] = args.learning_rate
optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False)
# Restore AMP master parameters
if args.fp16:
optimizer._lazy_init_maybe_master_weights()
optimizer._amp_stash.lazy_init_called = True
optimizer.load_state_dict(checkpoint['optimizer'])
for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
param.data.copy_(saved_param.data)
if args.local_rank != -1:
if not args.allreduce_post_accumulation:
model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())
else:
flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )
elif args.n_gpu > 1:
model = torch.nn.DataParallel(model)
criterion = BertPretrainingCriterion(config.vocab_size)
return model, optimizer, lr_scheduler, checkpoint, global_step, criterion
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
global skipped_steps
if args.allreduce_post_accumulation:
# manually allreduce gradients after all accumulation steps
# check for Inf/NaN
# 1. allocate an uninitialized buffer for flattened gradient
loss_scale = _amp_state.loss_scalers[0].loss_scale() if args.fp16 else 1
master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None]
flat_grad_size = sum(p.numel() for p in master_grads)
allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else torch.float32
flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
# 2. combine unflattening and predivision of unscaled 'raw' gradient
allreduced_views = apex_C.unflatten(flat_raw, master_grads)
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[master_grads, allreduced_views],
loss_scale / (get_world_size() * args.gradient_accumulation_steps))
# 3. sum gradient across ranks. Because of the predivision, this averages the gradient
torch.distributed.all_reduce(flat_raw)
# 4. combine unscaling and unflattening of allreduced gradient
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[allreduced_views, master_grads],
1./loss_scale)
# 5. update loss scale
if args.fp16:
scaler = _amp_state.loss_scalers[0]
old_overflow_buf = scaler._overflow_buf
scaler._overflow_buf = overflow_buf
had_overflow = scaler.update_scale()
scaler._overflow_buf = old_overflow_buf
else:
had_overflow = 0
# 6. call optimizer step function
if had_overflow == 0:
optimizer.step()
global_step += 1
else:
# Overflow detected, print message and clear gradients
skipped_steps += 1
if is_main_process():
scaler = _amp_state.loss_scalers[0]
dllogger.log(step="PARAMETER", data={"loss_scale": scaler.loss_scale()})
if _amp_state.opt_properties.master_weights:
for param in optimizer._amp_stash.all_fp32_from_fp16_params:
param.grad = None
for param in model.parameters():
param.grad = None
else:
optimizer.step()
#optimizer.zero_grad()
for param in model.parameters():
param.grad = None
global_step += 1
return global_step
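# take_optimizer_step() above flattens the master gradients into one buffer,
# predivides, issues a single all_reduce and then unscales/unflattens. A simplified
# pure-PyTorch sketch of that pattern, without the amp_C/apex_C fused kernels and
# without the fp16 loss-scale bookkeeping (an approximation, not the exact code path):
def _example_flat_allreduce(params, world_size, accumulation_steps):
    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
    grads = [p.grad for p in params if p.grad is not None]
    flat = _flatten_dense_tensors(grads)            # one contiguous buffer, one collective
    flat.div_(world_size * accumulation_steps)      # predivide so all_reduce(SUM) averages
    if torch.distributed.is_initialized():
        torch.distributed.all_reduce(flat)
    for g, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        g.copy_(synced)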
def main():
global timeout_sent
args = parse_arguments()
random.seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
torch.manual_seed(args.seed + args.local_rank)
torch.cuda.manual_seed(args.seed + args.local_rank)
worker_init = WorkerInitObj(args.seed + args.local_rank)
device, args = setup_training(args)
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
# Prepare optimizer
model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device)
if is_main_process():
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
raw_train_start = None
if args.do_train:
if is_main_process():
dllogger.log(step="PARAMETER", data={"train_start": True})
dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size})
dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate})
model.train()
most_recent_ckpts_paths = []
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
pool = ProcessPoolExecutor(1)
# Note: We loop infinitely over epochs, termination is handled via iteration count
while True:
thread = None
restored_data_loader = None
if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint:
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
files.sort()
num_files = len(files)
random.Random(args.seed + epoch).shuffle(files)
f_start_id = 0
else:
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
num_files = len(files)
# may not exist in all checkpoints
epoch = checkpoint.get('epoch', 0)
restored_data_loader = checkpoint.get('data_loader', None)
shared_file_list = {}
if torch.distributed.is_initialized() and get_world_size() > num_files:
remainder = get_world_size() % num_files
data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files]
else:
data_file = files[(f_start_id*get_world_size()+get_rank())%num_files]
previous_file = data_file
if restored_data_loader is None:
train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu,
num_workers=4, worker_init_fn=worker_init,
pin_memory=True)
# shared_file_list["0"] = (train_dataloader, data_file)
else:
train_dataloader = restored_data_loader
restored_data_loader = None
overflow_buf = None
if args.allreduce_post_accumulation:
overflow_buf = torch.cuda.IntTensor([0])
for f_id in range(f_start_id + 1 , len(files)):
if get_world_size() > num_files:
data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files]
else:
data_file = files[(f_id*get_world_size()+get_rank())%num_files]
previous_file = data_file
dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init)
train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
if raw_train_start is None:
raw_train_start = time.time()
for step, batch in enumerate(train_iter):
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
prediction_scores, seq_relationship_score = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
divisor = args.gradient_accumulation_steps
if args.gradient_accumulation_steps > 1:
if not args.allreduce_post_accumulation:
# this division was merged into predivision
loss = loss / args.gradient_accumulation_steps
divisor = 1.0
if args.fp16:
with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
average_loss += loss.item()
if training_steps % args.gradient_accumulation_steps == 0:
lr_scheduler.step() # learning rate warmup
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
if global_step >= args.steps_this_run or timeout_sent:
train_time_raw = time.time() - raw_train_start
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
average_loss = average_loss / (last_num_steps * divisor)
if (torch.distributed.is_initialized()):
average_loss /= get_world_size()
torch.distributed.all_reduce(average_loss)
final_loss = average_loss.item()
if is_main_process():
dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss})
elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
if is_main_process():
dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor),
"step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
"learning_rate": optimizer.param_groups[0]['lr']})
average_loss = 0
if global_step >= args.steps_this_run or training_steps % (
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
if is_main_process() and not args.skip_checkpoint:
# Save a trained model
dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
model_to_save = model.module if hasattr(model,
'module') else model # Only save the model itself
if args.resume_step < 0 or not args.phase2:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
else:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
if args.do_train:
torch.save({'model': model_to_save.state_dict(),
'optimizer': optimizer.state_dict(),
'master params': list(amp.master_params(optimizer)),
'files': [f_id] + files,
'epoch': epoch,
'data_loader': None if global_step >= args.max_steps else train_dataloader}, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
# Exiting the training due to hitting max steps, or being sent a
# timeout from the cluster scheduler
if global_step >= args.steps_this_run or timeout_sent:
del train_dataloader
# thread.join()
return args, final_loss, train_time_raw, global_step
del train_dataloader
# thread.join()
# Make sure pool has finished and switch train_dataloader
# NOTE: Will block until complete
train_dataloader, data_file = dataset_future.result(timeout=None)
epoch += 1
if __name__ == "__main__":
now = time.time()
args, final_loss, train_time_raw, global_step = main()
gpu_count = args.n_gpu
global_step += args.phase1_end_step if (args.phase2 and args.resume_step > 0) else 0
if args.resume_step == -1:
args.resume_step = 0
if torch.distributed.is_initialized():
gpu_count = get_world_size()
if is_main_process():
e2e_time = time.time() - now
training_perf = args.train_batch_size * args.gradient_accumulation_steps * gpu_count\
* (global_step - args.resume_step + skipped_steps) / train_time_raw
dllogger.log(step=tuple(), data={"e2e_train_time": e2e_time, "training_sequences_per_second": training_perf,
"final_loss": final_loss, "raw_train_time": train_time_raw })
dllogger.flush()
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# ==================
import csv
import os
import time
import argparse
import random
import h5py
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
from apex import amp
import multiprocessing
from tokenization import BertTokenizer
import modeling
from apex.optimizers import FusedLAMB
from schedulers import PolyWarmUpScheduler
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from utils import is_main_process, format_step, get_world_size, get_rank
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler
from apex.parallel.distributed import flat_dist_call
import amp_C
import apex_C
from apex.amp import _amp_state
import dllogger
from concurrent.futures import ProcessPoolExecutor
os.environ["HIP_VISIBLE_DEVICES"] = "0,1,2,3"
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
skipped_steps = 0
# Track whether a SIGTERM (cluster time up) has been handled
timeout_sent = False
import signal
# handle SIGTERM sent from the scheduler and mark so we
# can gracefully save & exit
def signal_handler(sig, frame):
global timeout_sent
timeout_sent = True
signal.signal(signal.SIGTERM, signal_handler)
#Workaround because python functions are not picklable
class WorkerInitObj(object):
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
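# WorkerInitObj above gives every DataLoader worker a distinct but reproducible seed
# (base seed + worker id), so forked workers do not share identical RNG streams.
# A small sketch of the per-worker draws this produces (illustrative only):
def _example_worker_seeds(base_seed=42, num_workers=4):
    import numpy as np
    init = WorkerInitObj(base_seed)
    draws = []
    for worker_id in range(num_workers):
        init(worker_id)                      # what DataLoader calls inside each worker
        draws.append(int(np.random.randint(0, 1000)))
    return draws                             # differs per worker, stable across runs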
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args, worker_init):
train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu,
num_workers=1, worker_init_fn=worker_init,
pin_memory=True)
return train_dataloader, input_file
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
keys = ['input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', 'masked_lm_ids',
'next_sentence_labels']
self.inputs = [np.asarray(f[key][:]) for key in keys]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.inputs[0])
def __getitem__(self, index):
[input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [
torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else torch.from_numpy(
np.asarray(input[index].astype(np.int64))) for indice, input in enumerate(self.inputs)]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
index = self.max_pred_length
# store number of masked tokens in index
padded_mask_indices = (masked_lm_positions == 0).nonzero()
if len(padded_mask_indices) != 0:
index = padded_mask_indices[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask,
masked_lm_labels, next_sentence_labels]
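# __getitem__ above converts the compact (masked_lm_positions, masked_lm_ids) pair
# into a dense per-token label vector with -1 everywhere that is not masked (the
# pretraining criterion ignores -1). A small numeric sketch with toy values:
def _example_dense_mlm_labels():
    import torch
    seq_len, max_pred = 8, 4
    masked_lm_positions = torch.tensor([1, 5, 0, 0])   # zero-padded masked indices
    masked_lm_ids = torch.tensor([17, 23, 0, 0])       # vocab ids of the masked tokens
    labels = torch.full((seq_len,), -1, dtype=torch.long)
    padded = (masked_lm_positions == 0).nonzero()
    n_masked = padded[0].item() if len(padded) != 0 else max_pred
    labels[masked_lm_positions[:n_masked]] = masked_lm_ids[:n_masked]
    return labels   # tensor([-1, 17, -1, -1, -1, 23, -1, -1])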
class BertPretrainingCriterion(torch.nn.Module):
def __init__(self, vocab_size):
super(BertPretrainingCriterion, self).__init__()
self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
self.vocab_size = vocab_size
def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels):
masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
def parse_arguments():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--init_checkpoint",
default=None,
type=str,
help="The initial checkpoint to start training from.")
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps",
default=1000,
type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.01,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=1.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
action='store_true',
help="Whether to use gradient checkpointing")
parser.add_argument("--resume_from_checkpoint",
default=False,
action='store_true',
help="Whether to resume training from checkpoint.")
parser.add_argument('--resume_step',
type=int,
default=-1,
help="Step to resume training from.")
parser.add_argument('--num_steps_per_checkpoint',
type=int,
default=100,
help="Number of update steps until a model checkpoint is saved to disk.")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--phase2',
default=False,
action='store_true',
help="Whether to train with seq len 512")
parser.add_argument('--allreduce_post_accumulation',
default=False,
action='store_true',
help="Whether to do allreduces during gradient accumulation steps.")
parser.add_argument('--allreduce_post_accumulation_fp16',
default=False,
action='store_true',
help="Whether to do fp16 allreduce post accumulation.")
parser.add_argument('--phase1_end_step',
type=int,
default=7038,
help="Number of training steps in Phase1 - seq len 128")
parser.add_argument('--init_loss_scale',
type=int,
default=2**20,
help="Initial loss scaler value")
parser.add_argument("--do_train",
default=False,
action='store_true',
help="Whether to run training.")
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
help='If provided, the json summary will be written to '
'the specified file.')
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--disable_progress_bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument('--steps_this_run', type=int, default=-1,
help='If provided, only run this many steps before exiting')
parser.add_argument("--dist_url",default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument("--gpus_per_node",type=int,default=4,
help='num of gpus per node')
parser.add_argument("--world_size",type=int,default=1,
help="number of process")
args = parser.parse_args()
args.fp16 = args.fp16 or args.amp
if args.steps_this_run < 0:
args.steps_this_run = args.max_steps
return args
def setup_training(args):
assert (torch.cuda.is_available())
if args.local_rank == -1:
device = torch.device("cuda")
args.n_gpu = torch.cuda.device_count()
args.allreduce_post_accumulation = False
args.allreduce_post_accumulation_fp16 = False
else:
#torch.cuda.set_device(args.local_rank)
#device = torch.device("cuda", args.local_rank)
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
#torch.distributed.init_process_group(backend='nccl', init_method='env://')
#xuan
device_n = args.local_rank % 8
torch.cuda.set_device(device_n)
device = torch.device("cuda", device_n)
torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url,
world_size=args.world_size, rank=args.local_rank)
args.n_gpu = 1
if args.gradient_accumulation_steps == 1:
args.allreduce_post_accumulation = False
args.allreduce_post_accumulation_fp16 = False
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
if args.train_batch_size % args.gradient_accumulation_steps != 0:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
args.gradient_accumulation_steps, args.train_batch_size))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
if not args.do_train:
raise ValueError(" `do_train` must be True.")
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
os.listdir(args.output_dir) and any([i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if (not args.resume_from_checkpoint or not os.path.exists(args.output_dir)) and is_main_process():
os.makedirs(args.output_dir, exist_ok=True)
return device, args
def prepare_model_and_optimizer(args, device):
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForPreTraining(config)
checkpoint = None
if not args.resume_from_checkpoint:
global_step = 0
else:
if args.resume_step == -1 and not args.init_checkpoint:
model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
global_step = args.resume_step if not args.init_checkpoint else 0
if not args.init_checkpoint:
checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
else:
checkpoint = torch.load(args.init_checkpoint, map_location="cpu")
model.load_state_dict(checkpoint['model'], strict=False)
if args.phase2 and not args.init_checkpoint:
global_step -= args.phase1_end_step
if is_main_process():
print("resume step from ", args.resume_step)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = FusedLAMB(optimizer_grouped_parameters,
lr=args.learning_rate)
lr_scheduler = PolyWarmUpScheduler(optimizer,
warmup=args.warmup_proportion,
total_steps=args.max_steps)
if args.fp16:
if args.loss_scale == 0:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", cast_model_outputs=torch.float16)
else:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale, cast_model_outputs=torch.float16)
amp._amp_state.loss_scalers[0]._loss_scale = args.init_loss_scale
model.checkpoint_activations(args.checkpoint_activations)
if args.resume_from_checkpoint:
if args.phase2 or args.init_checkpoint:
keys = list(checkpoint['optimizer']['state'].keys())
#Override hyperparameters from previous checkpoint
for key in keys:
checkpoint['optimizer']['state'][key]['step'] = global_step
for iter, item in enumerate(checkpoint['optimizer']['param_groups']):
checkpoint['optimizer']['param_groups'][iter]['step'] = global_step
checkpoint['optimizer']['param_groups'][iter]['t_total'] = args.max_steps
checkpoint['optimizer']['param_groups'][iter]['warmup'] = args.warmup_proportion
checkpoint['optimizer']['param_groups'][iter]['lr'] = args.learning_rate
optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False)
# Restore AMP master parameters
if args.fp16:
optimizer._lazy_init_maybe_master_weights()
optimizer._amp_stash.lazy_init_called = True
optimizer.load_state_dict(checkpoint['optimizer'])
for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
param.data.copy_(saved_param.data)
if args.local_rank != -1:
if not args.allreduce_post_accumulation:
model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())
else:
flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )
elif args.n_gpu > 1:
model = torch.nn.DataParallel(model)
criterion = BertPretrainingCriterion(config.vocab_size)
return model, optimizer, lr_scheduler, checkpoint, global_step, criterion
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
global skipped_steps
if args.allreduce_post_accumulation:
# manually allreduce gradients after all accumulation steps
# check for Inf/NaN
# 1. allocate an uninitialized buffer for flattened gradient
loss_scale = _amp_state.loss_scalers[0].loss_scale() if args.fp16 else 1
master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None]
flat_grad_size = sum(p.numel() for p in master_grads)
allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else torch.float32
flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
# 2. combine unflattening and predivision of unscaled 'raw' gradient
allreduced_views = apex_C.unflatten(flat_raw, master_grads)
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[master_grads, allreduced_views],
loss_scale / (get_world_size() * args.gradient_accumulation_steps))
# 3. sum gradient across ranks. Because of the predivision, this averages the gradient
torch.distributed.all_reduce(flat_raw)
# 4. combine unscaling and unflattening of allreduced gradient
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[allreduced_views, master_grads],
1./loss_scale)
# 5. update loss scale
if args.fp16:
scaler = _amp_state.loss_scalers[0]
old_overflow_buf = scaler._overflow_buf
scaler._overflow_buf = overflow_buf
had_overflow = scaler.update_scale()
scaler._overflow_buf = old_overflow_buf
else:
had_overflow = 0
# 6. call optimizer step function
if had_overflow == 0:
optimizer.step()
global_step += 1
else:
# Overflow detected, print message and clear gradients
skipped_steps += 1
if is_main_process():
scaler = _amp_state.loss_scalers[0]
dllogger.log(step="PARAMETER", data={"loss_scale": scaler.loss_scale()})
if _amp_state.opt_properties.master_weights:
for param in optimizer._amp_stash.all_fp32_from_fp16_params:
param.grad = None
for param in model.parameters():
param.grad = None
else:
optimizer.step()
#optimizer.zero_grad()
for param in model.parameters():
param.grad = None
global_step += 1
return global_step
def main():
global timeout_sent
args = parse_arguments()
random.seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
torch.manual_seed(args.seed + args.local_rank)
torch.cuda.manual_seed(args.seed + args.local_rank)
worker_init = WorkerInitObj(args.seed + args.local_rank)
device, args = setup_training(args)
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
# Prepare optimizer
model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device)
if is_main_process():
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
raw_train_start = None
if args.do_train:
if is_main_process():
dllogger.log(step="PARAMETER", data={"train_start": True})
dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size})
dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate})
model.train()
most_recent_ckpts_paths = []
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
pool = ProcessPoolExecutor(1)
# Note: We loop infinitely over epochs, termination is handled via iteration count
while True:
thread = None
restored_data_loader = None
if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint:
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
files.sort()
num_files = len(files)
random.Random(args.seed + epoch).shuffle(files)
f_start_id = 0
else:
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
num_files = len(files)
# may not exist in all checkpoints
epoch = checkpoint.get('epoch', 0)
restored_data_loader = checkpoint.get('data_loader', None)
shared_file_list = {}
if torch.distributed.is_initialized() and get_world_size() > num_files:
remainder = get_world_size() % num_files
data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files]
else:
data_file = files[(f_start_id*get_world_size()+get_rank())%num_files]
previous_file = data_file
if restored_data_loader is None:
train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu,
num_workers=4, worker_init_fn=worker_init,
pin_memory=True)
# shared_file_list["0"] = (train_dataloader, data_file)
else:
train_dataloader = restored_data_loader
restored_data_loader = None
overflow_buf = None
if args.allreduce_post_accumulation:
overflow_buf = torch.cuda.IntTensor([0])
for f_id in range(f_start_id + 1 , len(files)):
if get_world_size() > num_files:
data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files]
else:
data_file = files[(f_id*get_world_size()+get_rank())%num_files]
previous_file = data_file
dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init)
train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
if raw_train_start is None:
raw_train_start = time.time()
for step, batch in enumerate(train_iter):
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
prediction_scores, seq_relationship_score = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
divisor = args.gradient_accumulation_steps
if args.gradient_accumulation_steps > 1:
if not args.allreduce_post_accumulation:
# this division was merged into predivision
loss = loss / args.gradient_accumulation_steps
divisor = 1.0
if args.fp16:
with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
average_loss += loss.item()
if training_steps % args.gradient_accumulation_steps == 0:
lr_scheduler.step() # learning rate warmup
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
if global_step >= args.steps_this_run or timeout_sent:
train_time_raw = time.time() - raw_train_start
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
average_loss = average_loss / (last_num_steps * divisor)
if (torch.distributed.is_initialized()):
average_loss /= get_world_size()
torch.distributed.all_reduce(average_loss)
final_loss = average_loss.item()
if is_main_process():
dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss})
elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
if is_main_process():
dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor),
"step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
"learning_rate": optimizer.param_groups[0]['lr']})
average_loss = 0
if global_step >= args.steps_this_run or training_steps % (
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
if is_main_process() and not args.skip_checkpoint:
# Save a trained model
dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
model_to_save = model.module if hasattr(model,
'module') else model # Only save the model itself
if args.resume_step < 0 or not args.phase2:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
else:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
if args.do_train:
torch.save({'model': model_to_save.state_dict(),
'optimizer': optimizer.state_dict(),
'master params': list(amp.master_params(optimizer)),
'files': [f_id] + files,
'epoch': epoch,
'data_loader': None if global_step >= args.max_steps else train_dataloader}, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
# Exiting the training due to hitting max steps, or being sent a
# timeout from the cluster scheduler
if global_step >= args.steps_this_run or timeout_sent:
del train_dataloader
# thread.join()
return args, final_loss, train_time_raw, global_step
del train_dataloader
# thread.join()
# Make sure pool has finished and switch train_dataloader
# NOTE: Will block until complete
train_dataloader, data_file = dataset_future.result(timeout=None)
epoch += 1
if __name__ == "__main__":
now = time.time()
args, final_loss, train_time_raw, global_step = main()
gpu_count = args.n_gpu
global_step += args.phase1_end_step if (args.phase2 and args.resume_step > 0) else 0
if args.resume_step == -1:
args.resume_step = 0
if torch.distributed.is_initialized():
gpu_count = get_world_size()
if is_main_process():
e2e_time = time.time() - now
training_perf = args.train_batch_size * args.gradient_accumulation_steps * gpu_count\
* (global_step - args.resume_step + skipped_steps) / train_time_raw
dllogger.log(step=tuple(), data={"e2e_train_time": e2e_time, "training_sequences_per_second": training_perf,
"final_loss": final_loss, "raw_train_time": train_time_raw })
dllogger.flush()
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import, division, print_function
import argparse
import collections
import json
import logging
import math
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from apex import amp
from schedulers import LinearWarmUpScheduler
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
import modeling
from optimization import BertAdam, warmup_linear
from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
from utils import is_main_process, format_step
import dllogger, time
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""
A single training/test example for the Squad dataset.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (self.qas_id)
s += ", question_text: %s" % (
self.question_text)
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
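# A tiny construction sketch for SquadExample; the question, context tokens and
# answer below are made up for illustration and are not taken from SQuAD.
def _example_squad_example():
    doc_tokens = ["BERT", "was", "published", "in", "2018", "."]
    return SquadExample(
        qas_id="toy-0001",
        question_text="When was BERT published?",
        doc_tokens=doc_tokens,
        orig_answer_text="2018",
        start_position=4,          # index of "2018" in doc_tokens
        end_position=4,
        is_impossible=False)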
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training, version_2_with_negative):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
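# char_to_word_offset[i] will hold the index of the whitespace-delimited token containing
# character i of the context; it converts character-level answer offsets into token positions below.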
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
features = []
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
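# For example, with 500 sub-tokens, max_tokens_for_doc = 317 and doc_stride = 128,
# the resulting spans are (start=0, length=317), (128, 317) and (256, 244).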
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
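# Document tokens start after [CLS], the query tokens and the first [SEP],
# hence the offset of len(query_tokens) + 2.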
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible))
unique_id += 1
return features
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
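# The 0.01 * length term only breaks ties between spans whose minimum context is equal,
# slightly favouring longer spans.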
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def get_answers(examples, features, results, args):
predictions = collections.defaultdict(list) #it is possible that one example corresponds to multiple features
Prediction = collections.namedtuple('Prediction', ['text', 'start_logit', 'end_logit'])
if args.version_2_with_negative:
null_vals = collections.defaultdict(lambda: (float("inf"),0,0))
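# null_vals[qas_id] keeps, per question, the lowest null ([CLS]) score seen over all of its
# features, together with the corresponding [CLS] start/end logits.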
for ex, feat, result in match_results(examples, features, results):
start_indices = _get_best_indices(result.start_logits, args.n_best_size)
end_indices = _get_best_indices(result.end_logits, args.n_best_size)
prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args)
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
if args.version_2_with_negative:
score = result.start_logits[0] + result.end_logits[0]
if score < null_vals[ex.qas_id][0]:
null_vals[ex.qas_id] = (score, result.start_logits[0], result.end_logits[0])
curr_predictions = []
seen_predictions = []
for pred in prelim_predictions:
if len(curr_predictions) == args.n_best_size:
break
if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant
final_text = get_answer_text(ex, feat, pred, args)
if final_text in seen_predictions:
continue
else:
final_text = ""
seen_predictions.append(final_text)
curr_predictions.append(Prediction(final_text, pred.start_logit, pred.end_logit))
predictions[ex.qas_id] += curr_predictions
#Add empty prediction
if args.version_2_with_negative:
for qas_id in predictions.keys():
predictions[qas_id].append(Prediction('',
null_vals[qas_id][1],
null_vals[qas_id][2]))
nbest_answers = collections.defaultdict(list)
answers = {}
for qas_id, preds in predictions.items():
nbest = sorted(
preds,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)[:args.n_best_size]
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry and entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_answers[qas_id].append(output)
if args.version_2_with_negative:
score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit
if score_diff > args.null_score_diff_threshold:
answers[qas_id] = ""
else:
answers[qas_id] = best_non_null_entry.text
else:
answers[qas_id] = nbest_answers[qas_id][0]['text']
return answers, nbest_answers
def get_answer_text(example, feature, pred, args):
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging)
return final_text
def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args):
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction",
["start_index", "end_index", "start_logit", "end_logit"])
prelim_predictions = []
for start_index in start_indices:
for end_index in end_indices:
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > args.max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
return prelim_predictions
def match_results(examples, features, results):
unique_f_ids = set([f.unique_id for f in features])
unique_r_ids = set([r.unique_id for r in results])
matching_ids = unique_f_ids & unique_r_ids
features = [f for f in features if f.unique_id in matching_ids]
results = [r for r in results if r.unique_id in matching_ids]
features.sort(key=lambda x: x.unique_id)
results.sort(key=lambda x: x.unique_id)
for f, r in zip(features, results): #original code assumes strict ordering of examples. TODO: rewrite this
yield examples[f.example_index], f, r
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indices(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indices = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indices.append(index_and_score[i][0])
return best_indices
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
from apex.multi_tensor_apply import multi_tensor_applier
class GradientClipper:
"""
Clips gradient norm of an iterable of parameters.
"""
def __init__(self, max_grad_norm):
self.max_norm = max_grad_norm
if multi_tensor_applier.available:
import amp_C
self._overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
self.multi_tensor_scale = amp_C.multi_tensor_scale
else:
raise RuntimeError('Gradient clipping requires cuda extensions')
def step(self, parameters):
l = [p.grad for p in parameters if p.grad is not None]
total_norm, _ = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [l], False)
total_norm = total_norm.item()
if (total_norm == float('inf')): return
clip_coef = self.max_norm / (total_norm + 1e-6)
if clip_coef < 1:
multi_tensor_applier(self.multi_tensor_scale, self._overflow_buf, [l, l], clip_coef)
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument('--log_freq',
type=int, default=50,
help='frequency of logging loss.')
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
help='If provided, the json summary will be written to '
'the specified file.')
parser.add_argument("--eval_script",
help="Script to evaluate squad predictions",
default="evaluate.py",
type=str)
parser.add_argument("--do_eval",
action='store_true',
help="Whether to use evaluate accuracy of predictions")
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--disable-progress-bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument("--skip_cache",
default=False,
action='store_true',
help="Whether to cache train features")
parser.add_argument("--cache_dir",
default=None,
type=str,
help="Location to cache train feaures. Will default to the dataset directory")
args = parser.parse_args()
args.fp16 = args.fp16 or args.amp
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
n_gpu = 1
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
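# Note: args.train_batch_size below becomes the per-step micro-batch size; the requested total
# is reached by accumulating gradients over gradient_accumulation_steps micro-batches.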
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")
if args.do_train:
if not args.train_file:
raise ValueError(
"If `do_train` is True, then `train_file` must be specified.")
if args.do_predict:
if not args.predict_file:
raise ValueError(
"If `do_predict` is True, then `predict_file` must be specified.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and os.listdir(args.output_dir)!=['logfile.txt']:
print("WARNING: Output directory {} already exists and is not empty.".format(args.output_dir), os.listdir(args.output_dir))
if not os.path.exists(args.output_dir) and is_main_process():
os.makedirs(args.output_dir)
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
# tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
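# (a vocab size that is a multiple of 8 lets the embedding and output GEMMs use Tensor Cores
# efficiently under mixed precision)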
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForQuestionAnswering(config)
# model = modeling.BertForQuestionAnswering.from_pretrained(args.bert_model,
# cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
dllogger.log(step="PARAMETER", data={"loading_checkpoint": True})
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
dllogger.log(step="PARAMETER", data={"loaded_checkpoint": True})
model.to(device)
num_weights = sum([p.numel() for p in model.parameters() if p.requires_grad])
dllogger.log(step="PARAMETER", data={"model_weights_num":num_weights})
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produces None grads that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
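# Biases and LayerNorm parameters are kept out of weight decay, as in the original BERT fine-tuning setup.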
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.do_train:
if args.fp16:
try:
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False)
if args.loss_scale == 0:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False,
loss_scale="dynamic")
else:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
if args.do_train:
scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
global_step = 0
if args.do_train:
if args.cache_dir is None:
cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length))
else:
cached_train_features_file = args.cache_dir.strip('/') + '/' + args.train_file.split('/')[-1] + '_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length))
train_features = None
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if not args.skip_cache and is_main_process():
dllogger.log(step="PARAMETER", data={"Cached_train features_file": cached_train_features_file})
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
dllogger.log(step="PARAMETER", data={"train_start": True})
dllogger.log(step="PARAMETER", data={"training_samples": len(train_examples)})
dllogger.log(step="PARAMETER", data={"training_features": len(train_features)})
dllogger.log(step="PARAMETER", data={"train_batch_size":args.train_batch_size})
dllogger.log(step="PARAMETER", data={"steps":num_train_optimization_steps})
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu)
model.train()
gradClipper = GradientClipper(max_grad_norm=1.0)
final_loss = None
train_start = time.time()
for epoch in range(int(args.num_train_epochs)):
train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
for step, batch in enumerate(train_iter):
# Terminate early for benchmarking
if args.max_steps > 0 and global_step > args.max_steps:
break
if n_gpu == 1:
batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
start_logits, end_logits = model(input_ids, segment_ids, input_mask)
# If we are on multi-GPU, scattering may add an extra dimension; squeeze it
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs; we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
loss = (start_loss + end_loss) / 2
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
# gradient clipping
gradClipper.step(amp.master_params(optimizer))
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16 :
# apply BERT's linear warm-up learning rate schedule, which FusedAdam does not handle itself
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
final_loss = loss.item()
if step % args.log_freq == 0:
dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss,
"learning_rate": optimizer.param_groups[0]['lr']})
time_to_train = time.time() - train_start
if args.do_train and is_main_process() and not args.skip_checkpoint:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, modeling.WEIGHTS_NAME)
torch.save({"model":model_to_save.state_dict()}, output_model_file)
output_config_file = os.path.join(args.output_dir, modeling.CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
if args.do_predict and (args.local_rank == -1 or is_main_process()):
if not args.do_train and args.fp16:
model.half()
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
dllogger.log(step="PARAMETER", data={"infer_start": True})
dllogger.log(step="PARAMETER", data={"eval_samples": len(eval_examples)})
dllogger.log(step="PARAMETER", data={"eval_features": len(eval_features)})
dllogger.log(step="PARAMETER", data={"predict_batch_size": args.predict_batch_size})
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
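# all_example_index maps each row of the evaluation batches back to its entry in eval_features
# (despite the name, it indexes features, not examples).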
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
infer_start = time.time()
model.eval()
all_results = []
dllogger.log(step="PARAMETER", data={"eval_start": True})
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.disable_progress_bar):
if len(all_results) % 1000 == 0:
dllogger.log(step="PARAMETER", data={"sample_number": len(all_results)})
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
with torch.no_grad():
batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
for i, example_index in enumerate(example_indices):
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = eval_features[example_index.item()]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
time_to_infer = time.time() - infer_start
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args)
with open(output_prediction_file, "w") as f:
f.write(json.dumps(answers, indent=4) + "\n")
with open(output_nbest_file, "w") as f:
f.write(json.dumps(nbest_answers, indent=4) + "\n")
# output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
# write_predictions(eval_examples, eval_features, all_results,
# args.n_best_size, args.max_answer_length,
# args.do_lower_case, output_prediction_file,
# output_nbest_file, output_null_log_odds_file, args.verbose_logging,
# args.version_2_with_negative, args.null_score_diff_threshold)
if args.do_eval and is_main_process():
import sys
import subprocess
eval_out = subprocess.check_output([sys.executable, args.eval_script,
args.predict_file, args.output_dir + "/predictions.json"])
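# args.eval_script is expected to print a dict such as {"exact_match": ..., "f1": ...};
# the two floats are parsed out of that string below.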
scores = str(eval_out).strip()
exact_match = float(scores.split(":")[1].split(",")[0])
f1 = float(scores.split(":")[2].split("}")[0])
if args.do_train:
gpu_count = n_gpu
if torch.distributed.is_initialized():
gpu_count = torch.distributed.get_world_size()
if args.max_steps == -1:
dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train,
"training_sequences_per_second": len(train_features) * args.num_train_epochs / time_to_train,
"final_loss": final_loss})
else:
dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train,
"training_sequences_per_second": args.train_batch_size * args.gradient_accumulation_steps \
* args.max_steps * gpu_count / time_to_train,
"final_loss": final_loss})
if args.do_predict and is_main_process():
dllogger.log(step=tuple(), data={"e2e_inference_time": time_to_infer,
"inference_sequences_per_second": len(eval_features) / time_to_infer})
if args.do_eval and is_main_process():
dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
if __name__ == "__main__":
main()
dllogger.flush()
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import, division, print_function
import argparse
import collections
import json
import logging
import math
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from apex import amp
from schedulers import LinearWarmUpScheduler
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
import modeling
from optimization import BertAdam, warmup_linear
from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
from utils import is_main_process, format_step
import dllogger, time
torch._C._jit_set_profiling_mode(False)
#torch._C._jit_set_profiling_executor(False)
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""
A single training/test example for the Squad dataset.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (self.qas_id)
s += ", question_text: %s" % (
self.question_text)
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training, version_2_with_negative):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
features = []
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible))
unique_id += 1
return features
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def get_answers(examples, features, results, args):
predictions = collections.defaultdict(list) #it is possible that one example corresponds to multiple features
Prediction = collections.namedtuple('Prediction', ['text', 'start_logit', 'end_logit'])
if args.version_2_with_negative:
null_vals = collections.defaultdict(lambda: (float("inf"),0,0))
for ex, feat, result in match_results(examples, features, results):
start_indices = _get_best_indices(result.start_logits, args.n_best_size)
end_indices = _get_best_indices(result.end_logits, args.n_best_size)
prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args)
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
if args.version_2_with_negative:
score = result.start_logits[0] + result.end_logits[0]
if score < null_vals[ex.qas_id][0]:
null_vals[ex.qas_id] = (score, result.start_logits[0], result.end_logits[0])
curr_predictions = []
seen_predictions = []
for pred in prelim_predictions:
if len(curr_predictions) == args.n_best_size:
break
if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant
final_text = get_answer_text(ex, feat, pred, args)
if final_text in seen_predictions:
continue
else:
final_text = ""
seen_predictions.append(final_text)
curr_predictions.append(Prediction(final_text, pred.start_logit, pred.end_logit))
predictions[ex.qas_id] += curr_predictions
#Add empty prediction
if args.version_2_with_negative:
for qas_id in predictions.keys():
predictions[qas_id].append(Prediction('',
null_vals[qas_id][1],
null_vals[qas_id][2]))
nbest_answers = collections.defaultdict(list)
answers = {}
for qas_id, preds in predictions.items():
nbest = sorted(
preds,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)[:args.n_best_size]
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry and entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_answers[qas_id].append(output)
if args.version_2_with_negative:
score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit
if score_diff > args.null_score_diff_threshold:
answers[qas_id] = ""
else:
answers[qas_id] = best_non_null_entry.text
else:
answers[qas_id] = nbest_answers[qas_id][0]['text']
return answers, nbest_answers
def get_answer_text(example, feature, pred, args):
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging)
return final_text
def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args):
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction",
["start_index", "end_index", "start_logit", "end_logit"])
prelim_predictions = []
for start_index in start_indices:
for end_index in end_indices:
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > args.max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
return prelim_predictions
def match_results(examples, features, results):
unique_f_ids = set([f.unique_id for f in features])
unique_r_ids = set([r.unique_id for r in results])
matching_ids = unique_f_ids & unique_r_ids
features = [f for f in features if f.unique_id in matching_ids]
results = [r for r in results if r.unique_id in matching_ids]
features.sort(key=lambda x: x.unique_id)
results.sort(key=lambda x: x.unique_id)
for f, r in zip(features, results): #original code assumes strict ordering of examples. TODO: rewrite this
yield examples[f.example_index], f, r
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
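    # Illustrative behaviour of _strip_spaces (hypothetical input): for "a b" it
    # returns ("ab", {0: 0, 1: 2}), i.e. each index in the space-free string is
    # mapped back to its index in the original string.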
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indices(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indices = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indices.append(index_and_score[i][0])
return best_indices
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
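# _compute_softmax subtracts the maximum logit before exponentiating, which keeps
# math.exp from overflowing on large logits without changing the result. For example
# (illustrative numbers), scores [1.0, 2.0] give probabilities of roughly 0.27 and 0.73.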
from apex.multi_tensor_apply import multi_tensor_applier
class GradientClipper:
"""
Clips gradient norm of an iterable of parameters.
"""
def __init__(self, max_grad_norm):
self.max_norm = max_grad_norm
if multi_tensor_applier.available:
import amp_C
self._overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
self.multi_tensor_scale = amp_C.multi_tensor_scale
else:
raise RuntimeError('Gradient clipping requires cuda extensions')
def step(self, parameters):
l = [p.grad for p in parameters if p.grad is not None]
total_norm, _ = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [l], False)
total_norm = total_norm.item()
if (total_norm == float('inf')): return
clip_coef = self.max_norm / (total_norm + 1e-6)
if clip_coef < 1:
multi_tensor_applier(self.multi_tensor_scale, self._overflow_buf, [l, l], clip_coef)
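# Minimal usage sketch for GradientClipper, mirroring how the training loop below
# uses it (apex amp with master params is assumed to be initialized):
#
#     clipper = GradientClipper(max_grad_norm=1.0)
#     loss.backward()                              # or via amp.scale_loss(...) in fp16
#     clipper.step(amp.master_params(optimizer))   # clip before optimizer.step()
#     optimizer.step()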
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument('--log_freq',
type=int, default=50,
help='frequency of logging loss.')
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
                        help='If provided, the json summary will be written to '
                             'the specified file.')
parser.add_argument("--eval_script",
help="Script to evaluate squad predictions",
default="evaluate.py",
type=str)
parser.add_argument("--do_eval",
action='store_true',
help="Whether to use evaluate accuracy of predictions")
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--disable-progress-bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument("--skip_cache",
default=False,
action='store_true',
help="Whether to cache train features")
parser.add_argument("--cache_dir",
default=None,
type=str,
help="Location to cache train feaures. Will default to the dataset directory")
parser.add_argument("--dist_url",default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument("--gpus_per_node",type=int,default=4,
help='num of gpus per node')
parser.add_argument("--world_size",type=int,default=1,
help="number of process")
args = parser.parse_args()
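    # Illustrative invocation (script name, model name and paths are placeholders,
    # not values taken from this repository):
    #
    #   python run_squad.py --do_train --do_predict \
    #       --bert_model bert-large-uncased --do_lower_case \
    #       --init_checkpoint <pretrained.pt> --vocab_file <vocab.txt> --config_file <bert_config.json> \
    #       --train_file train-v1.1.json --predict_file dev-v1.1.json \
    #       --output_dir <output_dir>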
args.fp16 = args.fp16 or args.amp
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
#xuan
device_n = args.local_rank % 8
torch.cuda.set_device(device_n)
device = torch.device("cuda", device_n)
#torch.cuda.set_device(args.local_rank)
#device = torch.device("cuda", args.local_rank)
#device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
#torch.distributed.init_process_group(backend='gloo', init_method='env://')
#xuan
#if args.world_size > 1:
# args.local_rank = args.local_rank * args.gpus_per_node
torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url,
world_size=args.world_size, rank=args.local_rank)
n_gpu = 1
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")
if args.do_train:
if not args.train_file:
raise ValueError(
"If `do_train` is True, then `train_file` must be specified.")
if args.do_predict:
if not args.predict_file:
raise ValueError(
"If `do_predict` is True, then `predict_file` must be specified.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and os.listdir(args.output_dir)!=['logfile.txt']:
print("WARNING: Output directory {} already exists and is not empty.".format(args.output_dir), os.listdir(args.output_dir))
if not os.path.exists(args.output_dir) and is_main_process():
os.makedirs(args.output_dir)
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
# tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
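        # For illustration (hypothetical numbers): 80,000 training examples with
        # train_batch_size=16, gradient_accumulation_steps=1 and num_train_epochs=3
        # give int(80000 / 16 / 1) * 3 = 15,000 optimization steps, divided further by
        # the distributed world size when local_rank != -1.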
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForQuestionAnswering(config)
# model = modeling.BertForQuestionAnswering.from_pretrained(args.bert_model,
# cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
dllogger.log(step="PARAMETER", data={"loading_checkpoint": True})
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
dllogger.log(step="PARAMETER", data={"loaded_checkpoint": True})
model.to(device)
#model = model.cuda()
num_weights = sum([p.numel() for p in model.parameters() if p.requires_grad])
dllogger.log(step="PARAMETER", data={"model_weights_num":num_weights})
# Prepare optimizer
param_optimizer = list(model.named_parameters())
    # hack to remove the pooler, which is not used
    # and would otherwise produce None grads that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.do_train:
if args.fp16:
try:
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False)
if args.loss_scale == 0:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False,
loss_scale="dynamic")
else:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
if args.do_train:
scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
# model = torch.nn.parallel.DistributedDataParallel(model,device_ids=[device_n])
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
global_step = 0
if args.do_train:
if args.cache_dir is None:
cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length))
else:
cached_train_features_file = args.cache_dir.strip('/') + '/' + args.train_file.split('/')[-1] + '_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length))
train_features = None
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
        except Exception:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if not args.skip_cache and is_main_process():
dllogger.log(step="PARAMETER", data={"Cached_train features_file": cached_train_features_file})
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
dllogger.log(step="PARAMETER", data={"train_start": True})
dllogger.log(step="PARAMETER", data={"training_samples": len(train_examples)})
dllogger.log(step="PARAMETER", data={"training_features": len(train_features)})
dllogger.log(step="PARAMETER", data={"train_batch_size":args.train_batch_size})
dllogger.log(step="PARAMETER", data={"steps":num_train_optimization_steps})
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu)
model.train()
gradClipper = GradientClipper(max_grad_norm=1.0)
final_loss = None
train_start = time.time()
for epoch in range(int(args.num_train_epochs)):
train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
for step, batch in enumerate(train_iter):
# Terminate early for benchmarking
if args.max_steps > 0 and global_step > args.max_steps:
break
if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
start_logits, end_logits = model(input_ids, segment_ids, input_mask)
                # If we are on multi-GPU, splitting adds an extra dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
loss = (start_loss + end_loss) / 2
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
# gradient clipping
gradClipper.step(amp.master_params(optimizer))
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16 :
# modify learning rate with special warm up for BERT which FusedAdam doesn't do
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
final_loss = loss.item()
if step % args.log_freq == 0:
dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss,
"learning_rate": optimizer.param_groups[0]['lr']})
time_to_train = time.time() - train_start
if args.do_train and is_main_process() and not args.skip_checkpoint:
# Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, modeling.WEIGHTS_NAME)
torch.save({"model":model_to_save.state_dict()}, output_model_file)
output_config_file = os.path.join(args.output_dir, modeling.CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
if args.do_predict and (args.local_rank == -1 or is_main_process()):
if not args.do_train and args.fp16:
model.half()
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
dllogger.log(step="PARAMETER", data={"infer_start": True})
dllogger.log(step="PARAMETER", data={"eval_samples": len(eval_examples)})
dllogger.log(step="PARAMETER", data={"eval_features": len(eval_features)})
dllogger.log(step="PARAMETER", data={"predict_batch_size": args.predict_batch_size})
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
infer_start = time.time()
model.eval()
all_results = []
dllogger.log(step="PARAMETER", data={"eval_start": True})
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.disable_progress_bar):
if len(all_results) % 1000 == 0:
dllogger.log(step="PARAMETER", data={"sample_number": len(all_results)})
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
with torch.no_grad():
batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
for i, example_index in enumerate(example_indices):
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = eval_features[example_index.item()]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
time_to_infer = time.time() - infer_start
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args)
with open(output_prediction_file, "w") as f:
f.write(json.dumps(answers, indent=4) + "\n")
with open(output_nbest_file, "w") as f:
f.write(json.dumps(nbest_answers, indent=4) + "\n")
# output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
# write_predictions(eval_examples, eval_features, all_results,
# args.n_best_size, args.max_answer_length,
# args.do_lower_case, output_prediction_file,
# output_nbest_file, output_null_log_odds_file, args.verbose_logging,
# args.version_2_with_negative, args.null_score_diff_threshold)
if args.do_eval and is_main_process():
import sys
import subprocess
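        # The string parsing below assumes eval_script prints a dict-like summary such
        # as {"exact_match": 80.0, "f1": 88.0}; the values are recovered by splitting
        # on ':' and ',' rather than by parsing JSON.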
eval_out = subprocess.check_output([sys.executable, args.eval_script,
args.predict_file, args.output_dir + "/predictions.json"])
scores = str(eval_out).strip()
exact_match = float(scores.split(":")[1].split(",")[0])
f1 = float(scores.split(":")[2].split("}")[0])
if args.do_train:
gpu_count = n_gpu
if torch.distributed.is_initialized():
gpu_count = torch.distributed.get_world_size()
if args.max_steps == -1:
dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train,
"training_sequences_per_second": len(train_features) * args.num_train_epochs / time_to_train,
"final_loss": final_loss})
else:
dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train,
"training_sequences_per_second": args.train_batch_size * args.gradient_accumulation_steps \
* args.max_steps * gpu_count / time_to_train,
"final_loss": final_loss})
if args.do_predict and is_main_process():
dllogger.log(step=tuple(), data={"e2e_inference_time": time_to_infer,
"inference_sequences_per_second": len(eval_features) / time_to_infer})
if args.do_eval and is_main_process():
dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
if __name__ == "__main__":
main()
dllogger.flush()
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import, division, print_function
import argparse
import collections
import json
import logging
import math
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from apex import amp
from schedulers import LinearWarmUpScheduler
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
import modeling
from optimization import BertAdam, warmup_linear
from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
from utils import is_main_process, format_step
import dllogger, time
os.environ["HIP_VISIBLE_DEVICES"] = "0,1,2,3"
torch._C._jit_set_profiling_mode(False)
#torch._C._jit_set_profiling_executor(False)
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""
A single training/test example for the Squad dataset.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (self.qas_id)
s += ", question_text: %s" % (
self.question_text)
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training, version_2_with_negative):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
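# read_squad_examples expects the standard SQuAD layout, roughly:
#
#   {"data": [{"paragraphs": [{"context": "...",
#                              "qas": [{"id": "...", "question": "...",
#                                       "answers": [{"text": "...", "answer_start": 0}],
#                                       "is_impossible": false}]}]}]}
#
# ("is_impossible" is only read when version_2_with_negative is set).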
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
features = []
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
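        # Example with hypothetical sizes: 250 document tokens, max_tokens_for_doc=100
        # and doc_stride=128 yield spans (start=0, length=100), (100, 100) and (200, 50);
        # the offset advances by min(length, doc_stride), here by 100 each time.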
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible))
unique_id += 1
return features
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
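    # Concretely (illustrative tokens): if the WordPiece tokens for "(1895-1943)." are
    # ["(", "1895", "-", "1943", ")", "."] and orig_answer_text is "1895", the loops
    # below shrink the span to the single "1895" token.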
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
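    # Putting numbers on the example above: for 'bought', span B scores
    # min(4, 0) + 0.01 * 5 = 0.05 while span C scores min(1, 3) + 0.01 * 5 = 1.05,
    # so span C is the max-context span for that token.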
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def get_answers(examples, features, results, args):
predictions = collections.defaultdict(list) #it is possible that one example corresponds to multiple features
Prediction = collections.namedtuple('Prediction', ['text', 'start_logit', 'end_logit'])
if args.version_2_with_negative:
null_vals = collections.defaultdict(lambda: (float("inf"),0,0))
for ex, feat, result in match_results(examples, features, results):
start_indices = _get_best_indices(result.start_logits, args.n_best_size)
end_indices = _get_best_indices(result.end_logits, args.n_best_size)
prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args)
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
if args.version_2_with_negative:
score = result.start_logits[0] + result.end_logits[0]
if score < null_vals[ex.qas_id][0]:
null_vals[ex.qas_id] = (score, result.start_logits[0], result.end_logits[0])
curr_predictions = []
seen_predictions = []
for pred in prelim_predictions:
if len(curr_predictions) == args.n_best_size:
break
if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant
final_text = get_answer_text(ex, feat, pred, args)
if final_text in seen_predictions:
continue
else:
final_text = ""
seen_predictions.append(final_text)
curr_predictions.append(Prediction(final_text, pred.start_logit, pred.end_logit))
predictions[ex.qas_id] += curr_predictions
#Add empty prediction
if args.version_2_with_negative:
for qas_id in predictions.keys():
predictions[qas_id].append(Prediction('',
                                                  null_vals[qas_id][1],
                                                  null_vals[qas_id][2]))
nbest_answers = collections.defaultdict(list)
answers = {}
for qas_id, preds in predictions.items():
nbest = sorted(
preds,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)[:args.n_best_size]
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry and entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_answers[qas_id].append(output)
if args.version_2_with_negative:
score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit
if score_diff > args.null_score_diff_threshold:
answers[qas_id] = ""
else:
answers[qas_id] = best_non_null_entry.text
else:
answers[qas_id] = nbest_answers[qas_id][0]['text']
return answers, nbest_answers
def get_answer_text(example, feature, pred, args):
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging)
return final_text
def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args):
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction",
["start_index", "end_index", "start_logit", "end_logit"])
prelim_predictions = []
for start_index in start_indices:
for end_index in end_indices:
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > args.max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
return prelim_predictions
def match_results(examples, features, results):
unique_f_ids = set([f.unique_id for f in features])
unique_r_ids = set([r.unique_id for r in results])
matching_ids = unique_f_ids & unique_r_ids
features = [f for f in features if f.unique_id in matching_ids]
results = [r for r in results if r.unique_id in matching_ids]
features.sort(key=lambda x: x.unique_id)
results.sort(key=lambda x: x.unique_id)
for f, r in zip(features, results): #original code assumes strict ordering of examples. TODO: rewrite this
yield examples[f.example_index], f, r
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indices(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indices = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indices.append(index_and_score[i][0])
return best_indices
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
from apex.multi_tensor_apply import multi_tensor_applier
class GradientClipper:
"""
Clips gradient norm of an iterable of parameters.
"""
def __init__(self, max_grad_norm):
self.max_norm = max_grad_norm
if multi_tensor_applier.available:
import amp_C
self._overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
self.multi_tensor_scale = amp_C.multi_tensor_scale
else:
raise RuntimeError('Gradient clipping requires cuda extensions')
def step(self, parameters):
l = [p.grad for p in parameters if p.grad is not None]
total_norm, _ = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [l], False)
total_norm = total_norm.item()
if (total_norm == float('inf')): return
clip_coef = self.max_norm / (total_norm + 1e-6)
if clip_coef < 1:
multi_tensor_applier(self.multi_tensor_scale, self._overflow_buf, [l, l], clip_coef)
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument('--log_freq',
type=int, default=50,
help='frequency of logging loss.')
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
                        help='If provided, the json summary will be written to '
                             'the specified file.')
parser.add_argument("--eval_script",
help="Script to evaluate squad predictions",
default="evaluate.py",
type=str)
parser.add_argument("--do_eval",
action='store_true',
help="Whether to use evaluate accuracy of predictions")
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--disable-progress-bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument("--skip_cache",
default=False,
action='store_true',
help="Whether to cache train features")
parser.add_argument("--cache_dir",
default=None,
type=str,
help="Location to cache train feaures. Will default to the dataset directory")
parser.add_argument("--dist_url",default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument("--gpus_per_node",type=int,default=4,
help='num of gpus per node')
parser.add_argument("--world_size",type=int,default=1,
help="number of process")
args = parser.parse_args()
args.fp16 = args.fp16 or args.amp
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
print("n_gpu:",torch.cuda.device_count())
device_n = args.local_rank % 8
torch.cuda.set_device(device_n)
device = torch.device("cuda", device_n)
#torch.cuda.set_device(args.local_rank)
#device = torch.device("cuda", args.local_rank)
#device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
#torch.distributed.init_process_group(backend='gloo', init_method='env://')
#xuan
#if args.world_size > 1:
# args.local_rank = args.local_rank * args.gpus_per_node
torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url,
world_size=args.world_size, rank=args.local_rank)
n_gpu = 1
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
dllogger.log(step="PARAMETER", data={"Config": [str(args)]})
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")
if args.do_train:
if not args.train_file:
raise ValueError(
"If `do_train` is True, then `train_file` must be specified.")
if args.do_predict:
if not args.predict_file:
raise ValueError(
"If `do_predict` is True, then `predict_file` must be specified.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and os.listdir(args.output_dir)!=['logfile.txt']:
print("WARNING: Output directory {} already exists and is not empty.".format(args.output_dir), os.listdir(args.output_dir))
if not os.path.exists(args.output_dir) and is_main_process():
os.makedirs(args.output_dir)
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
# tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
config = modeling.BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
model = modeling.BertForQuestionAnswering(config)
# model = modeling.BertForQuestionAnswering.from_pretrained(args.bert_model,
# cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
dllogger.log(step="PARAMETER", data={"loading_checkpoint": True})
#model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
dllogger.log(step="PARAMETER", data={"loaded_checkpoint": True})
model.to(device)
#model = model.cuda()
num_weights = sum([p.numel() for p in model.parameters() if p.requires_grad])
dllogger.log(step="PARAMETER", data={"model_weights_num":num_weights})
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produces a None grad that breaks apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.do_train:
if args.fp16:
try:
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False)
if args.loss_scale == 0:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False,
loss_scale="dynamic")
else:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
if args.do_train:
scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
# model = torch.nn.parallel.DistributedDataParallel(model,device_ids=[device_n])
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
global_step = 0
if args.do_train:
if args.cache_dir is None:
cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length))
else:
cached_train_features_file = args.cache_dir.strip('/') + '/' + args.train_file.split('/')[-1] + '_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length))
train_features = None
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if not args.skip_cache and is_main_process():
dllogger.log(step="PARAMETER", data={"Cached_train features_file": cached_train_features_file})
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
dllogger.log(step="PARAMETER", data={"train_start": True})
dllogger.log(step="PARAMETER", data={"training_samples": len(train_examples)})
dllogger.log(step="PARAMETER", data={"training_features": len(train_features)})
dllogger.log(step="PARAMETER", data={"train_batch_size":args.train_batch_size})
dllogger.log(step="PARAMETER", data={"steps":num_train_optimization_steps})
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu)
model.train()
gradClipper = GradientClipper(max_grad_norm=1.0)
final_loss = None
train_start = time.time()
for epoch in range(int(args.num_train_epochs)):
train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader
for step, batch in enumerate(train_iter):
# Terminate early for benchmarking
if args.max_steps > 0 and global_step > args.max_steps:
break
if n_gpu == 1:
batch = tuple(t.to(device) for t in batch) # multi-gpu does the scattering itself
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
start_logits, end_logits = model(input_ids, segment_ids, input_mask)
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
loss = (start_loss + end_loss) / 2
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
# gradient clipping
gradClipper.step(amp.master_params(optimizer))
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16 :
# modify learning rate with special warm up for BERT which FusedAdam doesn't do
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
final_loss = loss.item()
if step % args.log_freq == 0:
dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss,
"learning_rate": optimizer.param_groups[0]['lr']})
time_to_train = time.time() - train_start
if args.do_train and is_main_process() and not args.skip_checkpoint:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = os.path.join(args.output_dir, modeling.WEIGHTS_NAME)
torch.save({"model":model_to_save.state_dict()}, output_model_file)
output_config_file = os.path.join(args.output_dir, modeling.CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
if args.do_predict and (args.local_rank == -1 or is_main_process()):
if not args.do_train and args.fp16:
model.half()
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
dllogger.log(step="PARAMETER", data={"infer_start": True})
dllogger.log(step="PARAMETER", data={"eval_samples": len(eval_examples)})
dllogger.log(step="PARAMETER", data={"eval_features": len(eval_features)})
dllogger.log(step="PARAMETER", data={"predict_batch_size": args.predict_batch_size})
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
infer_start = time.time()
model.eval()
all_results = []
dllogger.log(step="PARAMETER", data={"eval_start": True})
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.disable_progress_bar):
if len(all_results) % 1000 == 0:
dllogger.log(step="PARAMETER", data={"sample_number": len(all_results)})
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
with torch.no_grad():
batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
for i, example_index in enumerate(example_indices):
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = eval_features[example_index.item()]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
time_to_infer = time.time() - infer_start
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args)
with open(output_prediction_file, "w") as f:
f.write(json.dumps(answers, indent=4) + "\n")
with open(output_nbest_file, "w") as f:
f.write(json.dumps(nbest_answers, indent=4) + "\n")
# output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
# write_predictions(eval_examples, eval_features, all_results,
# args.n_best_size, args.max_answer_length,
# args.do_lower_case, output_prediction_file,
# output_nbest_file, output_null_log_odds_file, args.verbose_logging,
# args.version_2_with_negative, args.null_score_diff_threshold)
#if args.do_eval and is_main_process():
if args.do_eval:
import sys
import subprocess
eval_out = subprocess.check_output([sys.executable, args.eval_script,
args.predict_file, args.output_dir + "/predictions.json"])
scores = str(eval_out).strip()
exact_match = float(scores.split(":")[1].split(",")[0])
f1 = float(scores.split(":")[2].split("}")[0])
# check that exact_match and f1 were parsed from the eval script output
print('f1:', f1, "exact_match:", exact_match)
if args.do_train:
gpu_count = n_gpu
if torch.distributed.is_initialized():
gpu_count = torch.distributed.get_world_size()
if args.max_steps == -1:
dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train,
"training_sequences_per_second": len(train_features) * args.num_train_epochs / time_to_train,
"final_loss": final_loss})
else:
dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train,
"training_sequences_per_second": args.train_batch_size * args.gradient_accumulation_steps \
* args.max_steps * gpu_count / time_to_train,
"final_loss": final_loss})
if args.do_predict and is_main_process():
dllogger.log(step=tuple(), data={"e2e_inference_time": time_to_infer,
"inference_sequences_per_second": len(eval_features) / time_to_infer})
if args.do_eval and is_main_process():
# global exact_match
# global f1
dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
if __name__ == "__main__":
main()
dllogger.flush()
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
import argparse
import csv
import logging
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from optimization import BertAdam, warmup_linear
from tokenization import BertTokenizer
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class SwagExample(object):
"""A single training/test example for the SWAG dataset."""
def __init__(self,
swag_id,
context_sentence,
start_ending,
ending_0,
ending_1,
ending_2,
ending_3,
label = None):
self.swag_id = swag_id
self.context_sentence = context_sentence
self.start_ending = start_ending
self.endings = [
ending_0,
ending_1,
ending_2,
ending_3,
]
self.label = label
def __str__(self):
return self.__repr__()
def __repr__(self):
l = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
l.append("label: {}".format(self.label))
return ", ".join(l)
class InputFeatures(object):
def __init__(self,
example_id,
choices_features,
label
):
self.example_id = example_id
self.choices_features = [
{
'input_ids': input_ids,
'input_mask': input_mask,
'segment_ids': segment_ids
}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
def read_swag_examples(input_file, is_training):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
if is_training and lines[0][-1] != 'label':
raise ValueError(
"For training, the input file must contain a label column."
)
examples = [
SwagExample(
swag_id = line[2],
context_sentence = line[4],
start_ending = line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
ending_0 = line[7],
ending_1 = line[8],
ending_2 = line[9],
ending_3 = line[10],
label = int(line[11]) if is_training else None
) for line in lines[1:] # we skip the line with the column names
]
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
# Swag is a multiple choice task. To perform this task using Bert,
# we will use the formatting proposed in "Improving Language
# Understanding by Generative Pre-Training" and suggested by
# @jacobdevlin-google in this issue
# https://github.com/google-research/bert/issues/38.
#
# Each choice will correspond to a sample on which we run the
# inference. For a given Swag example, we will create the 4
# following inputs:
# - [CLS] context [SEP] choice_1 [SEP]
# - [CLS] context [SEP] choice_2 [SEP]
# - [CLS] context [SEP] choice_3 [SEP]
# - [CLS] context [SEP] choice_4 [SEP]
# The model will output a single value for each input. To get the
# final decision of the model, we will run a softmax over these 4
# outputs.
features = []
for example_index, example in enumerate(examples):
context_tokens = tokenizer.tokenize(example.context_sentence)
start_ending_tokens = tokenizer.tokenize(example.start_ending)
choices_features = []
for ending_index, ending in enumerate(example.endings):
# We create a copy of the context tokens in order to be
# able to shrink it according to ending_tokens
context_tokens_choice = context_tokens[:]
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
# Modifies `context_tokens_choice` and `ending_tokens` in
# place so that the total length is less than the
# specified length. Account for [CLS], [SEP], [SEP] with
# "- 3"
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
features.append(
InputFeatures(
example_id = example.swag_id,
choices_features = choices_features,
label = label
)
)
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def select_field(features, field):
return [
[
choice[field]
for choice in feature.choices_features
]
for feature in features
]
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .csv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--amp',
default=False,
action='store_true',
help="Mixed precision training")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
args.fp16 = args.fp16 or args.amp
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
num_choices=4)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produces a None grad that breaks apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.contrib.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
# Terminate early for benchmarking
if args.max_steps > 0 and global_step > args.max_steps:
break
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForMultipleChoice(config, num_choices=4)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
eval_features = convert_examples_to_features(
eval_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': tr_loss/nb_tr_steps}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
main()
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
class LRScheduler(_LRScheduler):
def __init__(self, optimizer, last_epoch=-1):
# Check if using mixed precision training
self.mixed_training = False
base_optimizer = optimizer
# Check that optimizer param is valid
if not isinstance(optimizer, Optimizer):
raise TypeError('{} is not an Optimizer'.format(
type(optimizer).__name__))
super(LRScheduler, self).__init__(base_optimizer, last_epoch)
def step(self, epoch=None):
# Set the current training step
# ('epoch' is used to be consistent with _LRScheduler)
if self.mixed_training:
# The assumption is that the step will be constant
state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]]
if 'step' in state_dict:
self.last_epoch = state_dict['step'] + 1
else:
self.last_epoch = 1
else:
self.last_epoch = epoch if epoch is not None else self.last_epoch + 1
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
param_group['lr'] = lr
class CosineWarmUpScheduler(LRScheduler):
"""
Applies a warm up period to the learning rate.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(CosineWarmUpScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return [base_lr * (0.5 * (1.0 + math.cos(math.pi * progress))) for base_lr in self.base_lrs]
class ConstantWarmUpScheduler(LRScheduler):
"""
Applies a warm up period to the learning rate.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(ConstantWarmUpScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return self.base_lrs
class LinearWarmUpScheduler(LRScheduler):
"""
Applies a warm up period to the learning rate.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return [base_lr * max(( progress - 1.0)/(self.warmup - 1.0), 0.) for base_lr in self.base_lrs]
class PolyWarmUpScheduler(LRScheduler):
"""
Applies a warm up period to the learning rate.
"""
def __init__(self, optimizer, warmup, total_steps, degree=0.5, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
self.degree = degree
super(PolyWarmUpScheduler, self).__init__(optimizer, last_epoch)
def step(self, epoch=None):
param_group = self.optimizer.param_groups[0]
if 'step' in param_group:
self.last_epoch = param_group['step'] + 1
else:
self.last_epoch = 1
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
param_group['lr'] = lr
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return [base_lr * ((1.0 - progress) ** self.degree) for base_lr in self.base_lrs]
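# Usage note (illustrative): the SQuAD fine-tuning runner above (fp16 path)
# builds LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion,
# total_steps=num_train_optimization_steps) and calls scheduler.step() right
# before optimizer.step() at every gradient-accumulation boundary, so
# last_epoch advances once per optimizer update rather than once per epoch.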
# pretrain for wiki-en
## 1.prepare datasets
* Download the data and decompress it
```
BERT_PREP_WORKING_DIR=./data_tf python3 ./bertPrep.py --action download --dataset wikicorpus_en
bzip2 -dk enwiki-20170201-pages-articles-multistream.xml.bz2
BERT_PREP_WORKING_DIR=./data_tf_mlperf python3 ./bertPrep.py --action text_formatting --dataset wikicorpus_en
```
* Shard the data
```
BERT_PREP_WORKING_DIR=./data_tf_mlperf python3 ./bertPrep.py --action sharding --dataset wikicorpus_en
```
* Generate phase 1 TF records
```
python3 bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 128 --max_predictions_per_seq 20 --vocab_file ~/NLP-0904/uncased_L-12_H-768_A-12/vocab.txt
```
* Generate phase 2 TF records
```
python3 bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 512 --max_predictions_per_seq 80 --vocab_file ~/NLP-0904/uncased_L-12_H-768_A-12/vocab.txt
```
## 2.install newer apex
```
wget https://github.com/ROCmSoftwarePlatform/apex/archive/v0.3.tar.gz
tar -zxf v0.3.tar.gz
cd apex-0.3   # extracted directory name; adjust if the archive unpacks elsewhere
python3 setup.py install --cuda_ext --cpp_ext
```
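A quick optional check that the apex modules used by the fine-tuning scripts import correctly (a minimal Python sketch; it only assumes the build above succeeded):
```
# Minimal import check for the apex pieces the training scripts rely on.
from apex import amp                      # mixed-precision wrapper (amp.initialize / amp.scale_loss)
from apex.optimizers import FusedAdam     # fused Adam optimizer used when --fp16 is set
from apex.parallel import DistributedDataParallel  # DDP wrapper used when --local_rank != -1
print("apex OK:", amp, FusedAdam, DistributedDataParallel)
```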
## 3.train
See the [README](http://10.0.100.3/dcutoolkit/deeplearing/dlexamples/-/blob/dev_xuan/PyTorch/NLP/BERT/README.md) for details.
<br>
# Fine-tune training and testing for SQuAD 1.1
## 1.download datasets
https://rajpurkar.github.io/SQuAD-explorer/
## 2.download pretrained model
https://github.com/google-research/bert#fine-tuning-with-bert
## 3.convert_tf_checkpoint
```
python3 convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k_ckpt/model.ckpt-28252 --bert_config_path ~/NLP/cks/bs64k_32k_ckpt/bert_config.json --output_checkpoint model.ckpt-28252.pt
```
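Before fine-tuning, you can optionally sanity-check the converted checkpoint (a minimal Python sketch; the filename follows the command above, and since different converter versions store the tensors either at the top level or under a `model` key, both layouts are handled):
```
import torch

# Load the converted checkpoint on CPU and report what it contains.
ckpt = torch.load("model.ckpt-28252.pt", map_location="cpu")
state_dict = ckpt["model"] if isinstance(ckpt, dict) and "model" in ckpt else ckpt
print("number of tensors:", len(state_dict))
print("sample keys:", list(state_dict)[:5])
```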
* You can download the converted model from:
```
Link: https://pan.baidu.com/s/1V8kFpgsLQe8tOAeft-5UpQ
Extraction code: vs8d
```
## 4.run
See the [README](http://10.0.100.3/dcutoolkit/deeplearing/dlexamples/-/blob/dev_xuan/PyTorch/NLP/BERT/README.md) for details.
The detailed records are at: http://wiki.sugon.com/display/~%E6%9D%A8%E7%92%87/BERT
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
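# batch_size_and_gradient_accumulation_steps derives the per-GPU batch size and
# the gradient accumulation factor from a global batch size: batch_size is
# global_batch_size / num_gpu, and gradient_accumulation_steps is doubled until
# the resulting micro-batch fits within batch_size_capacity.
# Illustrative example using the SST-2 FP16 settings below: global_batch_size=1024,
# num_gpu=1, batch_size_capacity=128 gives batch_size=1024 and
# gradient_accumulation_steps=8 (a micro-batch of 128 per step).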
batch_size_and_gradient_accumulation_steps() {
batch_size=$((global_batch_size / num_gpu))
gradient_accumulation_steps=1
while [ $((batch_size / gradient_accumulation_steps)) -gt $batch_size_capacity ]
do
gradient_accumulation_steps=$((gradient_accumulation_steps * 2))
done
}
commons () {
init_checkpoint=/workspace/bert/checkpoints/bert_uncased.pt
vocab_file=${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
config_file=/workspace/bert/bert_config.json
max_steps=-1.0
}
mrpc_commons () {
data_dir=${BERT_PREP_WORKING_DIR}/download/glue/MRPC/
out_dir=/workspace/bert/results/MRPC
task_name=mrpc
global_batch_size=128
learning_rate=2.4e-5
warmup_proportion=0.1
epochs=3
}
sst-2_commons () {
data_dir=${BERT_PREP_WORKING_DIR}/download/glue/SST-2/
out_dir=/workspace/bert/results/SST-2
task_name=sst-2
warmup_proportion=0.1
epochs=3
}
dgxa100_fp16_commons () {
batch_size_capacity=128
precision=fp16
}
dgxa100_tf32_commons () {
batch_size_capacity=64
precision=tf32
}
dgx2_fp16_commons () {
batch_size_capacity=128
precision=fp16
}
dgx2_fp32_commons () {
batch_size_capacity=64
precision=fp32
}
print_arguments_in_order () {
echo \
$init_checkpoint \
$data_dir \
$vocab_file \
$config_file \
$out_dir \
$task_name \
$num_gpu \
$batch_size \
$gradient_accumulation_steps \
$learning_rate \
$warmup_proportion \
$epochs \
$max_steps \
$precision
}
##########################################
# DGXA100 #
##########################################
##########################
# MRPC #
##########################
# AMP
mrpc_dgxa100_1gpu_fp16 () {
commons
mrpc_commons
dgxa100_fp16_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgxa100_2gpu_fp16 () {
commons
mrpc_commons
dgxa100_fp16_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgxa100_4gpu_fp16 () {
commons
mrpc_commons
dgxa100_fp16_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgxa100_8gpu_fp16 () {
commons
mrpc_commons
dgxa100_fp16_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
# TF32
mrpc_dgxa100_1gpu_tf32 () {
commons
mrpc_commons
dgxa100_tf32_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgxa100_2gpu_tf32 () {
commons
mrpc_commons
dgxa100_tf32_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgxa100_4gpu_tf32 () {
commons
mrpc_commons
dgxa100_tf32_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgxa100_8gpu_tf32 () {
commons
mrpc_commons
dgxa100_tf32_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
##########################
# SST-2 #
##########################
# AMP
sst-2_dgxa100_fp16_commons () {
global_batch_size=1024
learning_rate=3e-5
}
sst-2_dgxa100_1gpu_fp16 () {
commons
sst-2_commons
dgxa100_fp16_commons
sst-2_dgxa100_fp16_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgxa100_2gpu_fp16 () {
commons
sst-2_commons
dgxa100_fp16_commons
sst-2_dgxa100_fp16_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgxa100_4gpu_fp16 () {
commons
sst-2_commons
dgxa100_fp16_commons
sst-2_dgxa100_fp16_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgxa100_8gpu_fp16 () {
commons
sst-2_commons
dgxa100_fp16_commons
sst-2_dgxa100_fp16_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
# TF32
sst-2_dgxa100_tf32_commons () {
global_batch_size=512
learning_rate=2e-5
}
sst-2_dgxa100_1gpu_tf32 () {
commons
sst-2_commons
dgxa100_tf32_commons
sst-2_dgxa100_tf32_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgxa100_2gpu_tf32 () {
commons
sst-2_commons
dgxa100_tf32_commons
sst-2_dgxa100_tf32_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgxa100_4gpu_tf32 () {
commons
sst-2_commons
dgxa100_tf32_commons
sst-2_dgxa100_tf32_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgxa100_8gpu_tf32 () {
commons
sst-2_commons
dgxa100_tf32_commons
sst-2_dgxa100_tf32_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
##########################################
# DGX2 #
##########################################
##########################
# MRPC #
##########################
# AMP
mrpc_dgx2_1gpu_fp16 () {
commons
mrpc_commons
dgx2_fp16_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_2gpu_fp16 () {
commons
mrpc_commons
dgx2_fp16_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_4gpu_fp16 () {
commons
mrpc_commons
dgx2_fp16_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_8gpu_fp16 () {
commons
mrpc_commons
dgx2_fp16_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_16gpu_fp16 () {
commons
mrpc_commons
dgx2_fp16_commons
num_gpu=16
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
# FP32.
mrpc_dgx2_1gpu_fp32 () {
commons
mrpc_commons
dgx2_fp32_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_2gpu_fp32 () {
commons
mrpc_commons
dgx2_fp32_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_4gpu_fp32 () {
commons
mrpc_commons
dgx2_fp32_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_8gpu_fp32 () {
commons
mrpc_commons
dgx2_fp32_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
mrpc_dgx2_16gpu_fp32 () {
commons
mrpc_commons
dgx2_fp32_commons
num_gpu=16
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
##########################
# SST-2 #
##########################
sst-2_dgx2_commons () {
global_batch_size=1024
learning_rate=3e-5
}
# AMP
sst-2_dgx2_1gpu_fp16 () {
commons
sst-2_commons
dgx2_fp16_commons
sst-2_dgx2_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_2gpu_fp16 () {
commons
sst-2_commons
dgx2_fp16_commons
sst-2_dgx2_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_4gpu_fp16 () {
commons
sst-2_commons
dgx2_fp16_commons
sst-2_dgx2_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_8gpu_fp16 () {
commons
sst-2_commons
dgx2_fp16_commons
sst-2_dgx2_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_16gpu_fp16 () {
commons
sst-2_commons
dgx2_fp16_commons
sst-2_dgx2_commons
num_gpu=16
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
# FP32
sst-2_dgx2_1gpu_fp32 () {
commons
sst-2_commons
dgx2_fp32_commons
sst-2_dgx2_commons
num_gpu=1
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_2gpu_fp32 () {
commons
sst-2_commons
dgx2_fp32_commons
sst-2_dgx2_commons
num_gpu=2
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_4gpu_fp32 () {
commons
sst-2_commons
dgx2_fp32_commons
sst-2_dgx2_commons
num_gpu=4
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_8gpu_fp32 () {
commons
sst-2_commons
dgx2_fp32_commons
sst-2_dgx2_commons
num_gpu=8
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
sst-2_dgx2_16gpu_fp32 () {
commons
sst-2_commons
dgx2_fp32_commons
sst-2_dgx2_commons
num_gpu=16
batch_size_and_gradient_accumulation_steps
print_arguments_in_order
}
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dgxa100_8gpu_fp16 ()
{
train_batch_size="8192"
learning_rate="6e-3"
precision="fp16"
num_gpus=8
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=128
seed=42
job_name="bert_lamb_pretraining"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="true"
train_batch_size_phase2=4096
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=256
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_config.json
CODEDIR="/workspace/bert"
init_checkpoint="None"
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
}
dgxa100_8gpu_tf32 ()
{
train_batch_size="8192"
learning_rate="6e-3"
precision="tf32"
num_gpus=8
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=128
seed=42
job_name="bert_lamb_pretraining"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=4096
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=512
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_config.json
CODEDIR="/workspace/bert"
init_checkpoint="None"
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
}
# Full pretraining configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
dgx2_16gpu_fp16 ()
{
train_batch_size="4096"
learning_rate="6e-3"
precision="fp16"
num_gpus=16
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=64
seed=42
job_name="bert_lamb_pretraining"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="true"
train_batch_size_phase2=2048
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=128
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_config.json
CODEDIR="/workspace/bert"
init_checkpoint="None"
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
}
dgx2_16gpu_fp32 ()
{
train_batch_size="4096"
learning_rate="6e-3"
precision="fp32"
num_gpus=16
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=128
seed=42
job_name="bert_lamb_pretraining"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=2048
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=256
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_config.json
CODEDIR="/workspace/bert"
init_checkpoint="None"
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
}
# Full pretraining configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
dgx1_8gpu_fp16 ()
{
train_batch_size="8192"
learning_rate="6e-3"
precision="fp16"
num_gpus=8
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=512
seed=42
job_name="bert_lamb_pretraining"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="true"
train_batch_size_phase2=4096
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=512
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_config.json
CODEDIR="/workspace/bert"
init_checkpoint="None"
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
}
dgx1_8gpu_fp32 ()
{
train_batch_size="8192"
learning_rate="6e-3"
precision="fp32"
num_gpus=8
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=1024
seed=42
job_name="bert_lamb_pretraining"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=4096
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=1024
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_config.json
CODEDIR="/workspace/bert"
init_checkpoint="None"
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
}
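These config functions do not launch anything themselves: each one only echoes its values, and a launcher script is expected to capture that output and read it back as positional arguments. Below is a minimal sketch of that pattern, assuming a config file path of scripts/configs/pretrain_config.sh and a launcher named scripts/run_pretraining.sh whose argument order matches the echo order above (both names and the argument contract are assumptions, not taken from this file).
#!/usr/bin/env bash
# Sketch only: source the config file, call one config function, and forward
# its echoed values as positional arguments to a launcher script.
# The file path (scripts/configs/pretrain_config.sh), launcher name
# (scripts/run_pretraining.sh), and argument order are assumptions; adjust
# them to the real launcher's interface.
source scripts/configs/pretrain_config.sh
# The command substitution is deliberately unquoted so each echoed value
# becomes a separate positional argument.
bash scripts/run_pretraining.sh $(dgx1_8gpu_fp16)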
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
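# Full SQuAD training configs for NVIDIA DGX A100 (8x NVIDIA A100 GPU)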
dgxa100_8gpu_fp16 ()
{
init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
epochs="2.0"
batch_size="32"
learning_rate="3e-5"
precision="fp16"
num_gpu="8"
seed="1"
squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
OUT_DIR="/workspace/bert/results/SQuAD"
echo $init_checkpoint $epochs $batch_size $learning_rate \
$precision $num_gpu $seed $squad_dir $vocab_file \
$OUT_DIR
}
dgxa100_8gpu_tf32 ()
{
init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
epochs="2.0"
batch_size="16"
learning_rate="3e-5"
precision="tf32"
num_gpu="8"
seed="1"
squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
OUT_DIR="/workspace/bert/results/SQuAD"
echo $init_checkpoint $epochs $batch_size $learning_rate \
$precision $num_gpu $seed $squad_dir $vocab_file \
$OUT_DIR
}
# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
dgx2_16gpu_fp16 ()
{
init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
epochs="2.0"
batch_size="16"
learning_rate="3e-5"
precision="fp16"
num_gpu="16"
seed="1"
squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
OUT_DIR="/workspace/bert/results/SQuAD"
echo $init_checkpoint $epochs $batch_size $learning_rate \
$precision $num_gpu $seed $squad_dir $vocab_file \
$OUT_DIR
}
dgx2_16gpu_fp32 ()
{
init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
epochs="2.0"
batch_size="8"
learning_rate="3e-5"
precision="fp32"
num_gpu="16"
seed="1"
squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
OUT_DIR="/workspace/bert/results/SQuAD"
echo $init_checkpoint $epochs $batch_size $learning_rate \
$precision $num_gpu $seed $squad_dir $vocab_file \
$OUT_DIR
}
# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
dgx1_8gpu_fp16 ()
{
init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
epochs="2.0"
batch_size="10"
learning_rate="3e-5"
precision="fp16"
num_gpu="8"
seed="1"
squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
OUT_DIR="/workspace/bert/results/SQuAD"
echo $init_checkpoint $epochs $batch_size $learning_rate \
$precision $num_gpu $seed $squad_dir $vocab_file \
$OUT_DIR
}
dgx1_8gpu_fp32 ()
{
init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
epochs="2.0"
batch_size="4"
learning_rate="3e-5"
precision="fp32"
num_gpu="8"
seed="1"
squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
OUT_DIR="/workspace/bert/results/SQuAD"
echo $init_checkpoint $epochs $batch_size $learning_rate \
$precision $num_gpu $seed $squad_dir $vocab_file \
$OUT_DIR
}
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DATA_DIR=${1:-/workspace/bert/data}
# Download vocab files from pretrained model
cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.*
# Download SQUAD
cd $DATA_DIR/squad && . squad_download.sh
# Download SWAG
git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag
# Download GLUE
cd $DATA_DIR/glue && . download_mrpc.sh
# WIKI Download
cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh
# Bookcorpus Download
cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh
cd $DATA_DIR
# Create HDF5 files for WIKI
bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \
&& rm -r ./wikipedia_corpus/final_*
# Create HDF5 files for Bookcorpus
bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \
&& rm -r ./bookcorpus/final_*
# Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus
bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024
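The download script above takes the target data directory as an optional first positional argument and falls back to /workspace/bert/data. A usage sketch follows, assuming the script is saved as data_download.sh (the file name and the example host path are assumptions; the directory layout it produces still has to match what the configs above expect under BERT_PREP_WORKING_DIR).
# Sketch: download and preprocess into a custom location instead of the
# default /workspace/bert/data. The script name is an assumption.
bash data_download.sh /raid/datasets/bert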
#!/bin/bash
docker build --network=host . --rm --pull --no-cache -t bert
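The resulting image is typically started with GPU access and the dataset directory mounted where the scripts expect it. The sketch below shows one such launch; only the image tag bert comes from the build command above, and the host path and mount point are assumptions.
# Sketch: run the image built above with all GPUs visible and a host data
# directory mounted at /workspace/bert/data (both paths are assumptions).
docker run --gpus all -it --rm \
  --net=host --shm-size=8g \
  -v /raid/datasets/bert:/workspace/bert/data \
  bert bash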