Commit 5394b117 authored by sunxx1

Merge branch 'deepspeed-branch' into 'main'

Deepspeed branch

See merge request dcutoolkit/deeplearing/dlexamples_new!22
parents 491af051 316d3f90
import logging
import torch.distributed as dist
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class Logger():
def __init__(self, cuda=False):
self.logger = logging.getLogger(__name__)
self.cuda = cuda
def info(self, message, *args, **kwargs):
if (self.cuda and dist.get_rank() == 0) or not self.cuda:
self.logger.info(message, *args, **kwargs)
def error(self, message, *args, **kwargs):
self.logger.error(message, *args, **kwargs)
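# --- Illustrative usage (editor's sketch, not part of the original module) ---
# With cuda=False the torch.distributed rank check is skipped, so this also
# works without init_process_group; with cuda=True only rank 0 logs info.
if __name__ == "__main__":
    example_logger = Logger(cuda=False)
    example_logger.info("starting run with batch size %d", 32)
    example_logger.error("example error message: %s", "nothing is actually wrong")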
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class FocalLoss(nn.Module):
r"""
This criterion is an implementation of Focal Loss, which is proposed in
Focal Loss for Dense Object Detection.
Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])
The losses are averaged across observations for each minibatch.
Args:
alpha(1D Tensor, Variable) : the scalar factor for this criterion
gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
putting more focus on hard, misclassified examples
size_average(bool): By default, the losses are averaged over observations for each minibatch.
However, if the field size_average is set to False, the losses are
instead summed for each minibatch.
"""
def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
super(FocalLoss, self).__init__()
if alpha is None:
self.alpha = torch.ones(class_num, 1)
else:
if isinstance(alpha, Variable):
self.alpha = alpha
else:
self.alpha = Variable(alpha)
self.gamma = gamma
self.class_num = class_num
self.size_average = size_average
def forward(self, inputs, targets):
N = inputs.size(0)
C = inputs.size(1)
P = F.softmax(inputs, dim=1)
class_mask = inputs.data.new(N, C).fill_(0)
# class_mask = Variable(class_mask)
ids = targets.view(-1, 1)
class_mask.scatter_(1, ids.data, 1.)
if inputs.is_cuda and not self.alpha.is_cuda:
self.alpha = self.alpha.cuda()
alpha = self.alpha[ids.data.view(-1)]
probs = (P * class_mask).sum(1).view(-1, 1)
log_p = probs.log()
batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p
if self.size_average:
loss = batch_loss.mean()
else:
loss = batch_loss.sum()
return loss
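# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Toy forward/backward pass through FocalLoss; the class count (5) and batch
# size (4) are arbitrary.
if __name__ == "__main__":
    criterion = FocalLoss(class_num=5, gamma=2, size_average=True)
    logits = torch.randn(4, 5, requires_grad=True)
    targets = torch.LongTensor([0, 2, 4, 1])
    loss = criterion(logits, targets)
    loss.backward()
    print("focal loss:", loss.item())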
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from turing.utils import TorchTuple
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.modeling import BertPreTrainingHeads, PreTrainedBertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
class BertPretrainingLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config):
super(BertPretrainingLoss, self).__init__(config)
self.bert = bert_encoder
self.cls = BertPreTrainingHeads(
config, self.bert.embeddings.word_embeddings.weight)
self.cls.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
masked_lm_labels=None,
next_sentence_label=None):
sequence_output, pooled_output = self.bert(
input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
prediction_scores, seq_relationship_score = self.cls(
sequence_output, pooled_output)
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
next_sentence_label.view(-1))
masked_lm_loss = loss_fct(
prediction_scores.view(-1, self.config.vocab_size),
masked_lm_labels.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
else:
return prediction_scores, seq_relationship_score
class BertClassificationLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config, num_labels: int = 1):
super(BertClassificationLoss, self).__init__(config)
self.bert = bert_encoder
self.num_labels = num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.classifier.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None):
_, pooled_output = self.bert(input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
scores = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.BCEWithLogitsLoss()
loss = loss_fct(scores.view(-1, self.num_labels),
labels.view(-1, 1))
return loss
else:
return scores
class BertRegressionLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config):
super(BertRegressionLoss, self).__init__(config)
self.bert = bert_encoder
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.classifier.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None):
_, pooled_output = self.bert(input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, 1), labels.view(-1, 1))
return loss
else:
return logits
class BertMultiTask:
def __init__(self, args):
self.config = args.config
if not args.use_pretrain:
if args.progressive_layer_drop:
print("BertConfigPreLnLayerDrop")
from nvidia.modelingpreln_layerdrop import BertForPreTrainingPreLN, BertConfig
else:
from nvidia.modelingpreln import BertForPreTrainingPreLN, BertConfig
bert_config = BertConfig(**self.config["bert_model_config"])
bert_config.vocab_size = len(args.tokenizer.vocab)
# Padding for divisibility by 8
if bert_config.vocab_size % 8 != 0:
bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
print("VOCAB SIZE:", bert_config.vocab_size)
self.network = BertForPreTrainingPreLN(bert_config, args)
# Use pretrained bert weights
else:
self.bert_encoder = BertModel.from_pretrained(
self.config['bert_model_file'],
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
'distributed_{}'.format(args.local_rank))
bert_config = self.bert_encoder.config
self.device = None
def set_device(self, device):
self.device = device
def save(self, filename: str):
network = self.network.module
return torch.save(network.state_dict(), filename)
def load(self, model_state_dict: str):
return self.network.module.load_state_dict(
torch.load(model_state_dict,
map_location=lambda storage, loc: storage))
def move_batch(self, batch: TorchTuple, non_blocking=False):
return batch.to(self.device, non_blocking)
def eval(self):
self.network.eval()
def train(self):
self.network.train()
def save_bert(self, filename: str):
return torch.save(self.bert_encoder.state_dict(), filename)
def to(self, device):
assert isinstance(device, torch.device)
self.network.to(device)
def half(self):
self.network.half()
from tqdm import tqdm
from typing import Tuple
from random import shuffle
import pickle
import random
import numpy as np
from pathlib import Path
from pytorch_pretrained_bert.tokenization import BertTokenizer
def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if random.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
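# Illustrative usage (editor's sketch, not part of the original module):
#     tokens_a = ["the", "quick", "brown", "fox", "jumps"]
#     tokens_b = ["over", "the", "lazy", "dog"]
#     truncate_input_sequence(tokens_a, tokens_b, max_num_tokens=6)
#     # afterwards len(tokens_a) + len(tokens_b) <= 6; on each iteration the
#     # longer list is trimmed in place from a randomly chosen end.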
class TokenInstance:
""" TokenInstance is an object holding the basic unit of data that is
extracted from the raw text file and can be consumed by any BERT-like model.
"""
def __init__(self, tokens_a, tokens_b, is_next, lang="en"):
self.tokens_a = tokens_a
self.tokens_b = tokens_b
self.is_next = is_next  # 0 if tokens_b is a continuation of tokens_a, 1 if it is a random segment
self.lang = lang
def get_values(self):
return (self.tokens_a, self.tokens_b, self.is_next)
def get_lang(self):
return self.lang
class QueryPassageDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
qpl_tuple: Tuple[str, str, str] = line.split('\t')
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class QueryPassageFineTuningDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
entities = line.split('\t')
qpl_tuple: Tuple[str, str,
str] = (entities[0], entities[2], entities[4])
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class QueryInstanceDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
qpl_tuple: Tuple[str, str, str] = line.split('\t')
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class PretrainingDataCreator:
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length,
readin: int = 2000000,
dupe_factor: int = 5,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# Expected format (Q,T,U,S,D)
# query, title, url, snippet, document = line.split('\t')
# ! remove this following line later
document = line
if len(document.split("<sep>")) <= 3:
continue
lines = document.split("<sep>")
document = []
for seq in lines:
document.append(tokenizer.tokenize(seq))
# document = list(map(tokenizer.tokenize, lines))
documents.append(document)
documents = [x for x in documents if x]
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
def __len__(self):
return self.len
def __getstate__(self):
state = self.__dict__.copy()
return state
def __setstate__(self, state):
self.__dict__.update(state)
def save(self, filename):
with open(filename, 'wb') as outfile:
pickle.dump(self, outfile)
@staticmethod
def load(filename):
with open(filename, 'rb') as f:
return pickle.load(f)
def create_training_instance(self, index):
document = self.documents[index]
# l = 0
# for s in document:
# l+=len(s)
# print(l)
# print(document)
# Need to add [CLS] + 2*[SEP] tokens
max_num_tokens = self.max_seq_length - 3
# We want to maximize the input sequence length, but also want inputs similar
# to our generic task inputs, which will be comparatively smaller
# than the data on which we intend to pre-train.
target_seq_length = max_num_tokens
if random.random() < self.small_seq_prob:
target_seq_length = random.randint(5, max_num_tokens)
# We want the split between the two sequences for the NSP task to be
# interesting rather than an arbitrary point; otherwise the NSP
# task might become far too easy.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = random.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random Next
is_random_next = False
if len(current_chunk) == 1 or random.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# Pick a random document
for _ in range(10):
random_doc_index = random.randint(
0,
len(self.documents) - 1)
if random_doc_index != index:
break
random_doc = self.documents[random_doc_index]
random_start = random.randint(0, len(random_doc) - 1)
for j in range(random_start, len(random_doc)):
tokens_b.extend(random_doc[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual Next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
instances.append(
TokenInstance(tokens_a, tokens_b, int(is_random_next)))
# print(instances[-1])
current_chunk = []
current_length = 0
i += 1
# print(len(instances))
return instances
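# Illustrative usage (editor's sketch; the corpus path is hypothetical, and each
# input line is expected to hold one document whose sentences are joined by
# "<sep>", as assumed by __init__ above):
#     from pytorch_pretrained_bert.tokenization import BertTokenizer
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     creator = PretrainingDataCreator("corpus.txt", tokenizer,
#                                      max_seq_length=128, dupe_factor=2)
#     creator.save("corpus_instances.pkl")
#     reloaded = PretrainingDataCreator.load("corpus_instances.pkl")
#     print(len(reloaded), "TokenInstance objects")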
class CleanBodyDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 5,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
url, cleanbody, rand_int = line.rstrip("\n").split("\t")
cleanbody = cleanbody.replace("#TAB#", " ").replace(
"#NULL#", "").replace("#HASH#", "#")
cleanbody_parts = cleanbody.split("#R##N#")
for document in cleanbody_parts:
lines = document.split("#N#")
document = []
document_len = 0
for seq in lines:
tok_seq = tokenizer.tokenize(seq)
if len(tok_seq) != 0:
document.append(tok_seq)
document_len += len(tok_seq)
if document_len >= 200:
documents.append(document)
documents = [x for x in documents if x]
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 6,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
document = []
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# document = line
# if len(document.split("<sep>")) <= 3:
# continue
if len(line) == 0: # This is end of document
documents.append(document)
document = []
if len(line.split(' ')) > 2:
document.append(tokenizer.tokenize(line))
if len(document) > 0:
documents.append(document)
documents = [x for x in documents if x]
print(documents[0])
print(len(documents))
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class WikiPretrainingDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 6,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
document = []
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# document = line
# if len(document.split("<sep>")) <= 3:
# continue
if len(line) > 0 and line[:2] == "[[":  # A "[[" line marks the start of a new document
documents.append(document)
document = []
if len(line.split(' ')) > 2:
document.append(tokenizer.tokenize(line))
if len(document) > 0:
documents.append(document)
documents = [x for x in documents if x]
# print(len(documents))
# print(len(documents[0]))
# print(documents[0][0:10])
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class NumpyByteInstances:
TOKEN_SEP_VAL = int.from_bytes(b'\x1f', byteorder='big')
def __init__(self, data_creator):
self.data_creator = data_creator
self.getitem_fixed = self.sep_getitem_fixed if self.data_creator.use_separators else self.nosep_getitem_fixed
# if self.data_creator.multilingual:
# self.__getitem__ = self.getitem_multilingual
# else:
# self.__getitem__ = self.getitem_monolingual
def getitem_multilingual(self, i):
tokens_a, tokens_b, is_next = self.getitem_fixed(i)
return TokenInstance(tokens_a,
tokens_b,
is_next,
lang=self.data_creator.lang[i])
def getitem_monolingual(self, i):
return TokenInstance(*self.getitem_fixed(i))
def __getitem__(self, i):
if self.data_creator.multilingual:
return self.getitem_multilingual(i)
else:
return self.getitem_monolingual(i)
def nosep_getitem_fixed(self, i):
if i >= self.data_creator.len:
raise IndexError
if i < 0:
i += self.data_creator.len
instance_start, instance_end = self.data_creator.instance_offsets[i:i +
2]
tok_offsets_start, tok_offsets_end = self.data_creator.instance_token_offsets[
i:i + 2]
token_offsets = self.data_creator.token_offsets[
tok_offsets_start:tok_offsets_end]
tokens_split = self.data_creator.tokens_split[i]
token_arrs = np.split(
self.data_creator.data[instance_start:instance_end], token_offsets)
tokens = [t.tostring().decode('utf8') for t in token_arrs]
return tokens[:tokens_split], tokens[
tokens_split:], self.data_creator.is_next[i]
def sep_getitem_fixed(self, i):
if i >= self.data_creator.len:
raise IndexError
if i < 0:
i += self.data_creator.len
instance_start, instance_end = self.data_creator.instance_offsets[i:i +
2]
instance_data = self.data_creator.data[instance_start:instance_end]
tokens_split = self.data_creator.tokens_split[i]
token_arrs = np.split(
instance_data,
np.where(instance_data == NumpyByteInstances.TOKEN_SEP_VAL)
[0]) # split on the token separator
tokens = [
(t[1:] if i > 0 else t).tostring().decode('utf8')
for i, t in enumerate(token_arrs)
] # ignore first byte, which will be separator, for tokens after the first
return tokens[:tokens_split], tokens[
tokens_split:], self.data_creator.is_next[i]
def __len__(self):
return self.data_creator.len
class NumpyPretrainingDataCreator:
def __init__(self, path, mmap=False):
path = Path(path)
self.path = path
mmap_mode = 'r' if mmap else None
self.data = np.load(str(path / 'data.npy'), mmap_mode=mmap_mode)
self.is_next = np.load(str(path / 'is_next.npy'), mmap_mode=mmap_mode)
self.tokens_split = np.load(str(path / 'tokens_split.npy'),
mmap_mode=mmap_mode)
self.instance_offsets = np.load(str(path / 'instance_offsets.npy'),
mmap_mode=mmap_mode)
if (path / 'instance_token_offsets.npy').is_file():
self.use_separators = False
self.instance_token_offsets = np.load(str(
path / 'instance_token_offsets.npy'),
mmap_mode=mmap_mode)
self.token_offsets = np.load(str(path / 'token_offsets.npy'),
mmap_mode=mmap_mode)
else:
self.use_separators = True
self.instance_token_offsets = None
self.token_offsets = None
if (path / 'lang.npy').is_file():
self.multilingual = True
self.lang = np.load(str(path / 'lang.npy'), mmap_mode=mmap_mode)
else:
self.multilingual = False
self.lang = None
self.instances = NumpyByteInstances(self)
self.len = len(self.is_next)
def __len__(self):
return self.len
@classmethod
def load(cls, path):
return cls(path)
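# Illustrative usage (editor's sketch; the shard directory is hypothetical and
# must contain the .npy files loaded in __init__: data.npy, is_next.npy,
# tokens_split.npy, instance_offsets.npy, plus either the token-offset files
# or the separator-encoded layout):
#     dataset = NumpyPretrainingDataCreator.load("pretraining_shard_0")
#     print(len(dataset), "instances")
#     first = dataset.instances[0]                  # a TokenInstance
#     tokens_a, tokens_b, is_next = first.get_values()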
import torch
PAD = 0
def mask(x):
return x != PAD
def torch_long(x):
return torch.LongTensor(x)
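# --- Illustrative usage (editor's sketch, not part of the original module) ---
# mask() marks non-PAD positions in a padded id sequence; torch_long() is a
# thin helper for building LongTensors from Python lists.
if __name__ == "__main__":
    ids = torch_long([[101, 2023, 2003, 0, 0]])  # trailing zeros are PAD
    attention_mask = mask(ids)
    print(attention_mask)  # [[True, True, True, False, False]]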
import sys as _sys
from typing import List
from collections import _iskeyword # type: ignore
from tensorboardX import SummaryWriter
import os
SUMMARY_WRITER_DIR_NAME = 'runs'
def get_sample_writer(name, base=".."):
"""Returns a tensorboard summary writer
"""
return SummaryWriter(
log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name))
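# Illustrative usage (editor's sketch; the run name and base directory are
# arbitrary). Events land under <base>/runs/<name> and can be inspected with
# `tensorboard --logdir <base>/runs`:
#     writer = get_sample_writer("bert_pretraining_test", base=".")
#     writer.add_scalar("train/loss", 2.5, global_step=0)
#     writer.close()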
class TorchTuple(tuple):
def to(self, device, non_blocking=False):
raise NotImplementedError("")
_class_template = """\
from builtins import property as _property, tuple as _tuple
from operator import itemgetter as _itemgetter
from collections import OrderedDict
from turing.utils import TorchTuple
import torch
class {typename}(TorchTuple):
'{typename}({arg_list})'
__slots__ = ()
_fields = {field_names!r}
def __new__(_cls, {arg_list}):
'Create new instance of {typename}({arg_list})'
return _tuple.__new__(_cls, ({arg_list}))
@classmethod
def _make(cls, iterable, new=tuple.__new__, len=len):
'Make a new {typename} object from a sequence or iterable'
result = new(cls, iterable)
if len(result) != {num_fields:d}:
raise TypeError('Expected {num_fields:d} arguments, got %d' % len(result))
return result
def _replace(_self, **kwds):
'Return a new {typename} object replacing specified fields with new values'
result = _self._make(map(kwds.pop, {field_names!r}, _self))
if kwds:
raise ValueError('Got unexpected field names: %r' % list(kwds))
return result
def __repr__(self):
'Return a nicely formatted representation string'
return self.__class__.__name__ + '({repr_fmt})' % self
@property
def __dict__(self):
'A new OrderedDict mapping field names to their values'
return OrderedDict(zip(self._fields, self))
def _asdict(self):
'''Return a new OrderedDict which maps field names to their values.
This method is obsolete. Use vars(nt) or nt.__dict__ instead.
'''
return self.__dict__
def __getnewargs__(self):
'Return self as a plain tuple. Used by copy and pickle.'
return tuple(self)
def __getstate__(self):
'Exclude the OrderedDict from pickling'
return None
def to(self, device, non_blocking=False):
_dict = self.__dict__.copy()
new_dict = dict()
for key, value in _dict.items():
if isinstance(value, torch.Tensor):
if device.type != 'cpu' and non_blocking and torch.cuda.is_available():
new_dict[key] = value.cuda(device, non_blocking=non_blocking)
else:
new_dict[key] = value.to(device)
else:
new_dict[key] = value
return {typename}(**new_dict)
{field_defs}
"""
_repr_template = '{name}=%r'
_field_template = '''\
{name} = _property(_itemgetter({index:d}), doc='Alias for field number {index:d}')
'''
def namedtorchbatch(typename: str,
field_names: List[str],
verbose: bool = False,
rename: bool = False):
"""Returns a new subclass of tuple with named fields leveraging use of torch tensors.
"""
# Validate the field names. At the user's option, either generate an error
# message or automatically replace the field name with a valid name.
if isinstance(field_names, str):
field_names = field_names.replace(',', ' ').split()
field_names = list(map(str, field_names))
if rename:
seen: set = set()
for index, name in enumerate(field_names):
if (not name.isidentifier() or _iskeyword(name)
or name.startswith('_') or name in seen):
field_names[index] = '_%d' % index
seen.add(name)
for name in [typename] + field_names:
if not name.isidentifier():
raise ValueError('Type names and field names must be valid '
'identifiers: %r' % name)
if _iskeyword(name):
raise ValueError('Type names and field names cannot be a '
'keyword: %r' % name)
seen = set()
for name in field_names:
if name.startswith('_') and not rename:
raise ValueError('Field names cannot start with an underscore: '
'%r' % name)
if name in seen:
raise ValueError('Encountered duplicate field name: %r' % name)
seen.add(name)
# Fill-in the class template
class_definition = _class_template.format(
typename=typename,
field_names=tuple(field_names),
num_fields=len(field_names),
arg_list=repr(tuple(field_names)).replace("'", "")[1:-1],
repr_fmt=', '.join(
_repr_template.format(name=name) for name in field_names),
field_defs='\n'.join(
_field_template.format(index=index, name=name)
for index, name in enumerate(field_names)))
# Execute the template string in a temporary namespace and support
# tracing utilities by setting a value for frame.f_globals['__name__']
namespace = dict(__name__='namedtuple_%s' % typename)
exec(class_definition, namespace)
result = namespace[typename]
result._source = class_definition # type: ignore
if verbose:
print(result._source) # type: ignore
# For pickling to work, the __module__ variable needs to be set to the frame
# where the named tuple is created. Bypass this step in environments where
# sys._getframe is not defined (Jython for example) or sys._getframe is not
# defined for arguments greater than 0 (IronPython).
try:
result.__module__ = _sys._getframe(1).f_globals.get(
'__name__', '__main__')
except (AttributeError, ValueError):
pass
return result
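# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Build a named batch type, fill it with tensors, and move every tensor field
# to a device in one call; non-tensor fields pass through unchanged. This
# assumes the turing package is importable, since the generated class imports
# TorchTuple from turing.utils.
if __name__ == "__main__":
    import torch
    Batch = namedtorchbatch("Batch", ["input_ids", "attention_mask", "labels"])
    batch = Batch(input_ids=torch.zeros(2, 8, dtype=torch.long),
                  attention_mask=torch.ones(2, 8, dtype=torch.long),
                  labels=torch.tensor([1, 0]))
    batch = batch.to(torch.device("cpu"))
    print(batch.input_ids.shape, batch.labels)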
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": false,
"comm_backend_name": "mpi"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
mpirun -n $NGPU -npernode $NGPU_PER_NODE -hostfile /job/hostfile -x UCX_TLS=tcp --mca btl ^openib --mca btl_tcp_if_include eth0 -x NCCL_TREE_THRESHOLD=0 -x NCCL_IB_DISABLE=1 -x NCCL_SOCKET_IFNAME=eth0 python ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": true,
"comm_backend_name": "mpi"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
mpirun -n $NGPU -ppn $NGPU_PER_NODE -f /tmp/deepspeed_mvapich_hostfile -env MV2_SUPPORT_DL=1 -env MV2_USE_GDR=0 -env MV2_USE_CUDA=1 -env MV2_USE_GDRCOPY=0 -env MV2_SMP_USE_CMA=0 -env MV2_DEBUG_SHOW_BACKTRACE=1 python ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": false,
"comm_backend_name": "nccl"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# This script requires pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs).
# Read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at https://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
Component. BingBertSquad
Open Source License/Copyright Notice.
Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522
}
# coding=utf-8
# This script is adapted from the following HuggingFace file:
# https://github.com/huggingface/transformers/blob/d541938/src/transformers/modeling_bert.py
#
# It converts TensorFlow and HuggingFace checkpoint files to the DeepSpeed model format.
import os
import argparse
import logging
import torch
import re
import numpy as np
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def set_data(param, array):
try:
assert param.shape == array.shape
except AssertionError as e:
e.args += (param.shape, array.shape)
raise
param.data = torch.from_numpy(array)
def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
""" Load tf checkpoints in DeepSpeed model.
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in DeepSpeed requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(ckpt_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
qkv = {}
for name_str, array in zip(names, arrays):
name = name_str.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
# which are not required for using pretrained model
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info("Skipping {}".format("/".join(name)))
continue
pointer = model
key = None
skipping = False
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
# Special cases for DeepSpeed.
elif name_str.find("bert/pooler/dense") >= 0 and scope_names[0] == "dense":
pointer = getattr(pointer, "dense_act")
elif name_str.find("bert/embeddings/LayerNorm/gamma") >= 0 and scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif name_str.find("bert/embeddings/LayerNorm/beta") >= 0 and scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
skipping = True
break
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
# For transformer kernel layers.
if scope_names[0] == 'layer':
if name_str.find("attention/self/query/kernel") > 0:
key = "qw"
elif name_str.find("attention/self/query/bias") > 0:
key = "qb"
elif name_str.find("attention/self/key/kernel") > 0:
key = "kw"
elif name_str.find("attention/self/key/bias") > 0:
key = "kb"
elif name_str.find("attention/self/value/kernel") > 0:
key = "vw"
elif name_str.find("attention/self/value/bias") > 0:
key = "vb"
elif name_str.find("attention/output/dense/kernel") > 0:
pointer = getattr(pointer, "attn_ow")
elif name_str.find("attention/output/dense/bias") > 0:
pointer = getattr(pointer, "attn_ob")
elif name_str.find("attention/output/LayerNorm/gamma") > 0:
pointer = getattr(pointer, "attn_nw")
elif name_str.find("attention/output/LayerNorm/beta") > 0:
pointer = getattr(pointer, "attn_nb")
elif name_str.find("intermediate/dense/kernel") > 0:
pointer = getattr(pointer, "inter_w")
elif name_str.find("intermediate/dense/bias") > 0:
pointer = getattr(pointer, "inter_b")
elif name_str.find("output/dense/kernel") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_w")
elif name_str.find("output/dense/bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_b")
elif name_str.find("output/LayerNorm/gamma") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_w")
elif name_str.find("output/LayerNorm/beta") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_b")
else:
raise ValueError(f"unexpected scope name {name_str} in transformer layer.")
break
if skipping:
continue
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif "kernel" in name:
array = np.transpose(array)
if key is not None:
qkv[key] = array
if all(k in qkv for k in ("qw", "kw", "vw")):
array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
pointer = getattr(pointer, "attn_qkvw")
qkv.pop("qw")
qkv.pop("kw")
qkv.pop("vw")
elif all(k in qkv for k in ("qb", "kb", "vb")):
array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
pointer = getattr(pointer, "attn_qkvb")
qkv.pop("qb")
qkv.pop("kb")
qkv.pop("vb")
elif key is not None:
# For Q/K/V weight/bias in TF, do nothing if not all ready to merge.
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings/word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
""" Load huggingface checkpoints and convert to a deepspeed model.
"""
hf_path = os.path.abspath(ckpt_path)
logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
# Load weights from Huggingface model
ckpt = torch.load(hf_path, map_location=torch.device("cpu"))
qkv = {}
for name_str in ckpt.keys():
array = ckpt[name_str].numpy()
logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
name = name_str.split(".")
pointer = model
key = None
is_layer = False
skipping = False
for m_name in name:
# Special cases for DeepSpeed.
if name_str.find("bert.pooler.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
elif is_layer:
pass
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
logger.info("Skipping {}".format(".".join(name)))
skipping = True
break
if m_name == "layer":
is_layer = True
continue
if m_name.isnumeric() and is_layer:
num = int(m_name)
pointer = pointer[num]
is_layer = False
# For transformer kernel layers.
if name_str.find("attention.self.query.weight") > 0:
key = "qw"
elif name_str.find("attention.self.query.bias") > 0:
key = "qb"
elif name_str.find("attention.self.key.weight") > 0:
key = "kw"
elif name_str.find("attention.self.key.bias") > 0:
key = "kb"
elif name_str.find("attention.self.value.weight") > 0:
key = "vw"
elif name_str.find("attention.self.value.bias") > 0:
key = "vb"
elif name_str.find("attention.output.dense.weight") > 0:
pointer = getattr(pointer, "attn_ow")
elif name_str.find("attention.output.dense.bias") > 0:
pointer = getattr(pointer, "attn_ob")
elif name_str.find("attention.output.LayerNorm.weight") > 0:
pointer = getattr(pointer, "attn_nw")
elif name_str.find("attention.output.LayerNorm.bias") > 0:
pointer = getattr(pointer, "attn_nb")
elif name_str.find("intermediate.dense.weight") > 0:
pointer = getattr(pointer, "inter_w")
elif name_str.find("intermediate.dense.bias") > 0:
pointer = getattr(pointer, "inter_b")
elif name_str.find("output.dense.weight") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_w")
elif name_str.find("output.dense.bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_b")
elif name_str.find("output.LayerNorm.weight") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_w")
elif name_str.find("output.LayerNorm.bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_b")
else:
raise ValueError(f"unexpected scope name {name_str} in transformer layer.")
break
if skipping:
continue
if key is not None:
qkv[key] = array
if all(k in qkv for k in ("qw", "kw", "vw")):
array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
pointer = getattr(pointer, "attn_qkvw")
qkv.pop("qw")
qkv.pop("kw")
qkv.pop("vw")
elif all(k in qkv for k in ("qb", "kb", "vb")):
array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
pointer = getattr(pointer, "attn_qkvb")
qkv.pop("qb")
qkv.pop("kb")
qkv.pop("vb")
elif key is not None:
# For Q/K/V weight/bias in HF, do nothing if not all ready to merge.
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
""" Load huggingface checkpoints and convert to a deepspeed model.
"""
hf_path = os.path.abspath(ckpt_path)
logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
# Load weights from Huggingface model
ckpt = torch.load(hf_path, map_location=torch.device("cpu"))
qkv = {}
for name_str in ckpt.keys():
array = ckpt[name_str].numpy()
logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
name = name_str.split(".")
pointer = model
key = None
is_layer = False
skipping = False
for m_name in name:
# Special cases for DeepSpeed.
if name_str.find("intermediate.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
elif name_str.find("pooler.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
logger.info("Skipping {}".format(".".join(name)))
skipping = True
break
if skipping:
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_enabled):
# Load weights from checkpoint
if ckpt_type == "HF":
if kernel_enabled:
load_hf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
else:
load_hf_weights_in_bert_torch(model, ckpt_path, vocab_diff)
elif ckpt_type == "TF":
if kernel_enabled:
load_tf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
else:
raise ValueError("--deepspeed_transformer_kernel is required for loading TF checkpoint.")
else:
raise ValueError(f"Invalid ckpt_type: {ckpt_type}.")
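# Illustrative usage (editor's sketch; `model`, the checkpoint path, and the
# vocabulary padding are hypothetical and depend on how the training script
# builds its network):
#     # model: a question-answering module built with the DeepSpeed
#     # transformer kernel (so kernel_enabled=True)
#     convert_ckpt_to_deepspeed(
#         model,
#         ckpt_type="HF",
#         ckpt_path="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin",
#         vocab_diff=6,            # e.g. vocab 30522 padded up to 30528
#         kernel_enabled=True)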
{
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 10,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"weight_decay": 0.0,
"bias_correction": false
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
import argparse
import json
import evaluate as eval
if __name__ == '__main__':
expected_version = '1.1'
parser = argparse.ArgumentParser(description='Evaluation for SQuAD ' +
expected_version)
parser.add_argument('dataset_file', help='Dataset file')
parser.add_argument('prediction_file', help='Prediction File')
args = parser.parse_args()
print(
json.dumps(
eval.evaluate(expected_version, args.dataset_file,
args.prediction_file)))
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
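# Illustrative example (editor's sketch, not part of the original script):
#     f1_score("in the 10th century", "10th century")
# normalize_answer drops the article "the", giving prediction tokens
# ["in", "10th", "century"] and ground-truth tokens ["10th", "century"];
# precision = 2/3, recall = 1, so the returned F1 is 0.8.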
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(expected_version, ds_file, pred_file):
with open(ds_file) as dataset_file:
dataset_json = json.load(dataset_file)
if (dataset_json['version'] != expected_version):
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
with open(pred_file) as prediction_file:
predictions = json.load(prediction_file)
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(f1_score, prediction,
ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}