Commit 316d3f90 authored by Pan,Huiwen

Add DeepSpeed framework test models

parent aebde649
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(
os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
'PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
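# A small illustration (not used elsewhere) of the naming scheme above: the cache
# entry is sha256(url), with '.' + sha256(etag) appended when an ETag is known, so
# the same URL/ETag pair always maps to the same file. The URL and ETag below are
# hypothetical.
def _demo_cache_filename():
    name = url_to_filename('https://example.com/vocab.txt', etag='"abc123"')
    url_part, _, etag_part = name.partition('.')
    assert len(url_part) == 64 and len(etag_part) == 64  # two sha256 hex digests
    return name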
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError(
"unable to parse {} as a URL or as a local path".format(
url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError(
"HEAD request failed for url {} with status code {}".format(
url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url,
temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name,
cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
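# Usage sketch for the cache helpers above (the URL is purely illustrative):
# cached_path() returns existing local paths unchanged and downloads http(s)/s3
# URLs into PYTORCH_PRETRAINED_BERT_CACHE, keyed by sha256(url) plus the ETag,
# with a .json metadata file written next to each cached entry.
if __name__ == '__main__':
    cached = cached_path('https://example.com/vocab.txt')   # hypothetical URL
    url, etag = filename_to_url(os.path.basename(cached))   # reads the .json metadata
    print(cached, url, etag)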
import logging
import torch.distributed as dist
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class Logger():
def __init__(self, cuda=False):
self.logger = logging.getLogger(__name__)
self.cuda = cuda
def info(self, message, *args, **kwargs):
if (self.cuda and dist.get_rank() == 0) or not self.cuda:
self.logger.info(message, *args, **kwargs)
def error(self, message, *args, **kwargs):
self.logger.error(message, *args, **kwargs)
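# A minimal sketch of the rank-aware logger above: with cuda=True, info() is
# emitted only by rank 0 of a torch.distributed job, while error() is emitted by
# every rank. The fallback to cuda=False below just lets the sketch also run in a
# single, non-distributed process.
if __name__ == '__main__':
    rank_aware = Logger(cuda=dist.is_initialized())
    rank_aware.info("printed once per job (rank 0 only) when distributed")
    rank_aware.error("printed by every rank")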
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class FocalLoss(nn.Module):
r"""
This criterion is a implemenation of Focal Loss, which is proposed in
Focal Loss for Dense Object Detection.
Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])
The losses are averaged across observations for each minibatch.
Args:
alpha(1D Tensor, Variable) : the scalar factor for this criterion
gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
putting more focus on hard, misclassified examples
size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch.
However, if the field size_average is set to False, the losses are
instead summed for each minibatch.
"""
def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
super(FocalLoss, self).__init__()
if alpha is None:
self.alpha = torch.ones(class_num, 1)
else:
if isinstance(alpha, Variable):
self.alpha = alpha
else:
self.alpha = Variable(alpha)
self.gamma = gamma
self.class_num = class_num
self.size_average = size_average
def forward(self, inputs, targets):
N = inputs.size(0)
C = inputs.size(1)
P = F.softmax(inputs, dim=1)  # softmax over the class dimension
class_mask = inputs.data.new(N, C).fill_(0)
# class_mask = Variable(class_mask)
ids = targets.view(-1, 1)
class_mask.scatter_(1, ids.data, 1.)
if inputs.is_cuda and not self.alpha.is_cuda:
self.alpha = self.alpha.cuda()
alpha = self.alpha[ids.data.view(-1)]
probs = (P * class_mask).sum(1).view(-1, 1)
log_p = probs.log()
batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p
if self.size_average:
loss = batch_loss.mean()
else:
loss = batch_loss.sum()
return loss
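# A quick sanity-check sketch for the FocalLoss above: with the default uniform
# alpha, the (1 - p)^gamma factor down-weights well-classified examples, so the
# focal loss on random logits is never larger than plain cross-entropy.
if __name__ == '__main__':
    logits = torch.randn(4, 3)              # N=4 examples, C=3 classes
    targets = torch.tensor([0, 2, 1, 2])
    focal = FocalLoss(class_num=3, gamma=2)
    print(float(focal(logits, targets)), float(F.cross_entropy(logits, targets)))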
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from turing.utils import TorchTuple
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.modeling import BertPreTrainingHeads, PreTrainedBertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
class BertPretrainingLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config):
super(BertPretrainingLoss, self).__init__(config)
self.bert = bert_encoder
self.cls = BertPreTrainingHeads(
config, self.bert.embeddings.word_embeddings.weight)
self.cls.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
masked_lm_labels=None,
next_sentence_label=None):
sequence_output, pooled_output = self.bert(
input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
prediction_scores, seq_relationship_score = self.cls(
sequence_output, pooled_output)
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
next_sentence_label.view(-1))
masked_lm_loss = loss_fct(
prediction_scores.view(-1, self.config.vocab_size),
masked_lm_labels.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
else:
return prediction_scores, seq_relationship_score
class BertClassificationLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config, num_labels: int = 1):
super(BertClassificationLoss, self).__init__(config)
self.bert = bert_encoder
self.num_labels = num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.classifier.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None):
_, pooled_output = self.bert(input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
scores = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.BCEWithLogitsLoss()
loss = loss_fct(scores.view(-1, self.num_labels),
labels.view(-1, 1))
return loss
else:
return scores
class BertRegressionLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config):
super(BertRegressionLoss, self).__init__(config)
self.bert = bert_encoder
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.classifier.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None):
_, pooled_output = self.bert(input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, 1), labels.view(-1, 1))
return loss
else:
return logits
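# Sketch (not used elsewhere) of wiring a pretrained encoder into the
# classification head above; the checkpoint name is illustrative. The pattern is
# simply: load a BertModel, then wrap it with one of the *Loss heads defined here.
def _demo_classification_head():
    encoder = BertModel.from_pretrained('bert-base-uncased')  # illustrative checkpoint
    head = BertClassificationLoss(encoder, encoder.config, num_labels=1)
    input_ids = torch.zeros(2, 16, dtype=torch.long)          # toy batch of padded ids
    labels = torch.tensor([[1.0], [0.0]])                     # float targets for BCE-with-logits
    return head(input_ids, labels=labels)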
class BertMultiTask:
def __init__(self, args):
self.config = args.config
if not args.use_pretrain:
if args.progressive_layer_drop:
print("BertConfigPreLnLayerDrop")
from nvidia.modelingpreln_layerdrop import BertForPreTrainingPreLN, BertConfig
else:
from nvidia.modelingpreln import BertForPreTrainingPreLN, BertConfig
bert_config = BertConfig(**self.config["bert_model_config"])
bert_config.vocab_size = len(args.tokenizer.vocab)
# Padding for divisibility by 8
if bert_config.vocab_size % 8 != 0:
bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
print("VOCAB SIZE:", bert_config.vocab_size)
self.network = BertForPreTrainingPreLN(bert_config, args)
# Use pretrained bert weights
else:
self.bert_encoder = BertModel.from_pretrained(
self.config['bert_model_file'],
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
'distributed_{}'.format(args.local_rank))
bert_config = self.bert_encoder.config
self.device = None
def set_device(self, device):
self.device = device
def save(self, filename: str):
network = self.network.module
return torch.save(network.state_dict(), filename)
def load(self, model_state_dict: str):
return self.network.module.load_state_dict(
torch.load(model_state_dict,
map_location=lambda storage, loc: storage))
def move_batch(self, batch: TorchTuple, non_blocking=False):
return batch.to(self.device, non_blocking)
def eval(self):
self.network.eval()
def train(self):
self.network.train()
def save_bert(self, filename: str):
return torch.save(self.bert_encoder.state_dict(), filename)
def to(self, device):
assert isinstance(device, torch.device)
self.network.to(device)
def half(self):
self.network.half()
from tqdm import tqdm
from typing import Tuple
from random import shuffle
import pickle
import random
import numpy as np
from pathlib import Path
from pytorch_pretrained_bert.tokenization import BertTokenizer
def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if random.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
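# Tiny illustration (not used elsewhere) of the truncation helper above: tokens
# are removed in place, randomly from the front or the back of the longer list,
# until the pair fits within max_num_tokens.
def _demo_truncate():
    tokens_a = ['tok'] * 8
    tokens_b = ['tok'] * 4
    truncate_input_sequence(tokens_a, tokens_b, 9)
    assert len(tokens_a) + len(tokens_b) <= 9
    return tokens_a, tokens_b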
class TokenInstance:
""" This TokenInstance is a obect to have the basic units of data that should be
extracted from the raw text file and can be consumed by any BERT like model.
"""
def __init__(self, tokens_a, tokens_b, is_next, lang="en"):
self.tokens_a = tokens_a
self.tokens_b = tokens_b
self.is_next = is_next  # 0 if tokens_b is a continuation of tokens_a, 1 if it is random
self.lang = lang
def get_values(self):
return (self.tokens_a, self.tokens_b, self.is_next)
def get_lang(self):
return self.lang
class QueryPassageDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
qpl_tuple: Tuple[str, str, str] = line.split('\t')
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class QueryPassageFineTuningDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
entities = line.split('\t')
qpl_tuple: Tuple[str, str,
str] = (entities[0], entities[2], entities[4])
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class QueryInstanceDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
qpl_tuple: Tuple[str, str, str] = line.split('\t')
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class PretrainingDataCreator:
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length,
readin: int = 2000000,
dupe_factor: int = 5,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# Expected format (Q,T,U,S,D)
# query, title, url, snippet, document = line.split('\t')
# ! remove this following line later
document = line
if len(document.split("<sep>")) <= 3:
continue
lines = document.split("<sep>")
document = []
for seq in lines:
document.append(tokenizer.tokenize(seq))
# document = list(map(tokenizer.tokenize, lines))
documents.append(document)
documents = [x for x in documents if x]
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
def __len__(self):
return self.len
def __getstate__(self):
state = self.__dict__.copy()
return state
def __setstate__(self, state):
self.__dict__.update(state)
def save(self, filename):
with open(filename, 'wb') as outfile:
pickle.dump(self, outfile)
@staticmethod
def load(filename):
with open(filename, 'rb') as f:
return pickle.load(f)
def create_training_instance(self, index):
document = self.documents[index]
# l = 0
# for s in document:
# l+=len(s)
# print(l)
# print(document)
# Need to add [CLS] + 2*[SEP] tokens
max_num_tokens = self.max_seq_length - 3
# We want to maximize the input sequence length but also want inputs similar
# to our generic task inputs, which will be comparatively smaller
# than the data on which we intend to pre-train.
target_seq_length = max_num_tokens
if random.random() < self.small_seq_prob:
target_seq_length = random.randint(5, max_num_tokens)
# We need to make the split point for the NSP task interesting,
# rather than choosing some arbitrary point. If not, the NSP
# task might become way too easy.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = random.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random Next
is_random_next = False
if len(current_chunk) == 1 or random.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# Pick a random document
for _ in range(10):
random_doc_index = random.randint(
0,
len(self.documents) - 1)
if random_doc_index != index:
break
random_doc = self.documents[random_doc_index]
random_start = random.randint(0, len(random_doc) - 1)
for j in range(random_start, len(random_doc)):
tokens_b.extend(random_doc[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual Next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
instances.append(
TokenInstance(tokens_a, tokens_b, int(is_random_next)))
# print(instances[-1])
current_chunk = []
current_length = 0
i += 1
# print(len(instances))
return instances
class CleanBodyDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 5,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
url, cleanbody, rand_int = line.rstrip("\n").split("\t")
cleanbody = cleanbody.replace("#TAB#", " ").replace(
"#NULL#", "").replace("#HASH#", "#")
cleanbody_parts = cleanbody.split("#R##N#")
for document in cleanbody_parts:
lines = document.split("#N#")
document = []
document_len = 0
for seq in lines:
tok_seq = tokenizer.tokenize(seq)
if len(tok_seq) != 0:
document.append(tok_seq)
document_len += len(tok_seq)
if document_len >= 200:
documents.append(document)
documents = [x for x in documents if x]
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 6,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
document = []
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# document = line
# if len(document.split("<sep>")) <= 3:
# continue
if len(line) == 0: # This is end of document
documents.append(document)
document = []
if len(line.split(' ')) > 2:
document.append(tokenizer.tokenize(line))
if len(document) > 0:
documents.append(document)
documents = [x for x in documents if x]
print(documents[0])
print(len(documents))
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class WikiPretrainingDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 6,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
document = []
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# document = line
# if len(document.split("<sep>")) <= 3:
# continue
if len(line) > 0 and line[:2] == "[[":  # a "[[" title line starts a new document (the previous one ends here)
documents.append(document)
document = []
if len(line.split(' ')) > 2:
document.append(tokenizer.tokenize(line))
if len(document) > 0:
documents.append(document)
documents = [x for x in documents if x]
# print(len(documents))
# print(len(documents[0]))
# print(documents[0][0:10])
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class NumpyByteInstances:
TOKEN_SEP_VAL = int.from_bytes(b'\x1f', byteorder='big')
def __init__(self, data_creator):
self.data_creator = data_creator
self.getitem_fixed = self.sep_getitem_fixed if self.data_creator.use_separators else self.nosep_getitem_fixed
# if self.data_creator.multilingual:
# self.__getitem__ = self.getitem_multilingual
# else:
# self.__getitem__ = self.getitem_monolingual
def getitem_multilingual(self, i):
tokens_a, tokens_b, is_next = self.getitem_fixed(i)
return TokenInstance(tokens_a,
tokens_b,
is_next,
lang=self.data_creator.lang[i])
def getitem_monolingual(self, i):
return TokenInstance(*self.getitem_fixed(i))
def __getitem__(self, i):
if self.data_creator.multilingual:
return self.getitem_multilingual(i)
else:
return self.getitem_monolingual(i)
def nosep_getitem_fixed(self, i):
if i >= self.data_creator.len:
raise IndexError
if i < 0:
i += self.data_creator.len
instance_start, instance_end = self.data_creator.instance_offsets[i:i + 2]
tok_offsets_start, tok_offsets_end = self.data_creator.instance_token_offsets[i:i + 2]
token_offsets = self.data_creator.token_offsets[tok_offsets_start:tok_offsets_end]
tokens_split = self.data_creator.tokens_split[i]
token_arrs = np.split(self.data_creator.data[instance_start:instance_end], token_offsets)
tokens = [t.tostring().decode('utf8') for t in token_arrs]
return tokens[:tokens_split], tokens[tokens_split:], self.data_creator.is_next[i]
def sep_getitem_fixed(self, i):
if i >= self.data_creator.len:
raise IndexError
if i < 0:
i += self.data_creator.len
instance_start, instance_end = self.data_creator.instance_offsets[i:i + 2]
instance_data = self.data_creator.data[instance_start:instance_end]
tokens_split = self.data_creator.tokens_split[i]
# split on the token separator byte
token_arrs = np.split(instance_data, np.where(instance_data == NumpyByteInstances.TOKEN_SEP_VAL)[0])
# ignore the first byte (the separator itself) for tokens after the first
tokens = [(t[1:] if i > 0 else t).tostring().decode('utf8') for i, t in enumerate(token_arrs)]
return tokens[:tokens_split], tokens[tokens_split:], self.data_creator.is_next[i]
def __len__(self):
return self.data_creator.len
class NumpyPretrainingDataCreator:
def __init__(self, path, mmap=False):
path = Path(path)
self.path = path
mmap_mode = 'r' if mmap else None
self.data = np.load(str(path / 'data.npy'), mmap_mode=mmap_mode)
self.is_next = np.load(str(path / 'is_next.npy'), mmap_mode=mmap_mode)
self.tokens_split = np.load(str(path / 'tokens_split.npy'),
mmap_mode=mmap_mode)
self.instance_offsets = np.load(str(path / 'instance_offsets.npy'),
mmap_mode=mmap_mode)
if (path / 'instance_token_offsets.npy').is_file():
self.use_separators = False
self.instance_token_offsets = np.load(str(
path / 'instance_token_offsets.npy'),
mmap_mode=mmap_mode)
self.token_offsets = np.load(str(path / 'token_offsets.npy'),
mmap_mode=mmap_mode)
else:
self.use_separators = True
self.instance_token_offsets = None
self.token_offsets = None
if (path / 'lang.npy').is_file():
self.multilingual = True
self.lang = np.load(str(path / 'lang.npy'), mmap_mode=mmap_mode)
else:
self.multilingual = False
self.lang = None
self.instances = NumpyByteInstances(self)
self.len = len(self.is_next)
def __len__(self):
return self.len
@classmethod
def load(cls, path):
return cls(path)
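# Usage sketch for the numpy-backed dataset above; the directory path is
# hypothetical and must contain the .npy shards produced by the offline
# preprocessing step (data.npy, is_next.npy, tokens_split.npy, ...).
if __name__ == '__main__':
    data = NumpyPretrainingDataCreator.load('/path/to/preprocessed_shard')
    tokens_a, tokens_b, is_next = data.instances[0].get_values()
    print(len(data), len(tokens_a), len(tokens_b), is_next)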
import torch
PAD = 0
def mask(x):
return x != PAD
def torch_long(x):
return torch.LongTensor(x)
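# Tiny sketch of the two helpers above: build a LongTensor batch of padded token
# ids (the id values are arbitrary) and derive the attention mask by comparing
# against PAD (= 0).
if __name__ == '__main__':
    batch_ids = torch_long([[101, 2054, 102, 0, 0],
                            [101, 2129, 2003, 102, 0]])
    attention_mask = mask(batch_ids)  # True wherever the token is not padding
    print(attention_mask.long())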
import sys as _sys
from typing import List
from collections import _iskeyword # type: ignore
from tensorboardX import SummaryWriter
import os
SUMMARY_WRITER_DIR_NAME = 'runs'
def get_sample_writer(name, base=".."):
"""Returns a tensorboard summary writer
"""
return SummaryWriter(
log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name))
class TorchTuple(tuple):
def to(self, device, non_blocking=False):
raise NotImplementedError("")
_class_template = """\
from builtins import property as _property, tuple as _tuple
from operator import itemgetter as _itemgetter
from collections import OrderedDict
from turing.utils import TorchTuple
import torch
class {typename}(TorchTuple):
'{typename}({arg_list})'
__slots__ = ()
_fields = {field_names!r}
def __new__(_cls, {arg_list}):
'Create new instance of {typename}({arg_list})'
return _tuple.__new__(_cls, ({arg_list}))
@classmethod
def _make(cls, iterable, new=tuple.__new__, len=len):
'Make a new {typename} object from a sequence or iterable'
result = new(cls, iterable)
if len(result) != {num_fields:d}:
raise TypeError('Expected {num_fields:d} arguments, got %d' % len(result))
return result
def _replace(_self, **kwds):
'Return a new {typename} object replacing specified fields with new values'
result = _self._make(map(kwds.pop, {field_names!r}, _self))
if kwds:
raise ValueError('Got unexpected field names: %r' % list(kwds))
return result
def __repr__(self):
'Return a nicely formatted representation string'
return self.__class__.__name__ + '({repr_fmt})' % self
@property
def __dict__(self):
'A new OrderedDict mapping field names to their values'
return OrderedDict(zip(self._fields, self))
def _asdict(self):
'''Return a new OrderedDict which maps field names to their values.
This method is obsolete. Use vars(nt) or nt.__dict__ instead.
'''
return self.__dict__
def __getnewargs__(self):
'Return self as a plain tuple. Used by copy and pickle.'
return tuple(self)
def __getstate__(self):
'Exclude the OrderedDict from pickling'
return None
def to(self, device, non_blocking=False):
_dict = self.__dict__.copy()
new_dict = dict()
for key, value in _dict.items():
if isinstance(value, torch.Tensor):
if device.type != 'cpu' and non_blocking and torch.cuda.is_available():
new_dict[key] = value.cuda(device, non_blocking=non_blocking)
else:
new_dict[key] = value.to(device)
else:
new_dict[key] = value
return {typename}(**new_dict)
{field_defs}
"""
_repr_template = '{name}=%r'
_field_template = '''\
{name} = _property(_itemgetter({index:d}), doc='Alias for field number {index:d}')
'''
def namedtorchbatch(typename: str,
field_names: List[str],
verbose: bool = False,
rename: bool = False):
"""Returns a new subclass of tuple with named fields leveraging use of torch tensors.
"""
# Validate the field names. At the user's option, either generate an error
# message or automatically replace the field name with a valid name.
if isinstance(field_names, str):
field_names = field_names.replace(',', ' ').split()
field_names = list(map(str, field_names))
if rename:
seen: set = set()
for index, name in enumerate(field_names):
if (not name.isidentifier() or _iskeyword(name)
or name.startswith('_') or name in seen):
field_names[index] = '_%d' % index
seen.add(name)
for name in [typename] + field_names:
if not name.isidentifier():
raise ValueError('Type names and field names must be valid '
'identifiers: %r' % name)
if _iskeyword(name):
raise ValueError('Type names and field names cannot be a '
'keyword: %r' % name)
seen = set()
for name in field_names:
if name.startswith('_') and not rename:
raise ValueError('Field names cannot start with an underscore: '
'%r' % name)
if name in seen:
raise ValueError('Encountered duplicate field name: %r' % name)
seen.add(name)
# Fill-in the class template
class_definition = _class_template.format(
typename=typename,
field_names=tuple(field_names),
num_fields=len(field_names),
arg_list=repr(tuple(field_names)).replace("'", "")[1:-1],
repr_fmt=', '.join(
_repr_template.format(name=name) for name in field_names),
field_defs='\n'.join(
_field_template.format(index=index, name=name)
for index, name in enumerate(field_names)))
# Execute the template string in a temporary namespace and support
# tracing utilities by setting a value for frame.f_globals['__name__']
namespace = dict(__name__='namedtuple_%s' % typename)
exec(class_definition, namespace)
result = namespace[typename]
result._source = class_definition # type: ignore
if verbose:
print(result._source) # type: ignore
# For pickling to work, the __module__ variable needs to be set to the frame
# where the named tuple is created. Bypass this step in environments where
# sys._getframe is not defined (Jython for example) or sys._getframe is not
# defined for arguments greater than 0 (IronPython).
try:
result.__module__ = _sys._getframe(1).f_globals.get(
'__name__', '__main__')
except (AttributeError, ValueError):
pass
return result
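# Usage sketch for namedtorchbatch above: it generates a TorchTuple subclass whose
# .to(device) moves every torch.Tensor field and passes other fields through
# unchanged. The field names below are illustrative.
if __name__ == '__main__':
    import torch
    SquadBatch = namedtorchbatch('SquadBatch', ['input_ids', 'input_mask', 'labels'])
    batch = SquadBatch(input_ids=torch.zeros(2, 8, dtype=torch.long),
                       input_mask=torch.ones(2, 8, dtype=torch.long),
                       labels=torch.tensor([0, 1]))
    batch = batch.to(torch.device('cpu'))
    print(type(batch).__name__, batch.input_ids.shape)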
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": false,
"comm_backend_name": "mpi"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
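# Sketch of the batch-size bookkeeping behind the config above and the launch
# scripts below: DeepSpeed requires train_batch_size to equal
# micro_batch_per_gpu * gradient_accumulation_steps * world_size.
world_size = 4 * 8                                           # NUM_NODES * NGPU_PER_NODE in the scripts
micro_batch_per_gpu = 3                                      # train_micro_batch_size_per_gpu
grad_accum_steps = 96 // (micro_batch_per_gpu * world_size)  # = 1 here
assert micro_batch_per_gpu * grad_accum_steps * world_size == 96  # train_batch_size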
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
mpirun -n $NGPU -npernode $NGPU_PER_NODE -hostfile /job/hostfile -x UCX_TLS=tcp --mca btl ^openib --mca btl_tcp_if_include eth0 -x NCCL_TREE_THRESHOLD=0 -x NCCL_IB_DISABLE=1 -x NCCL_SOCKET_IFNAME=eth0 python ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": true,
"comm_backend_name": "mpi"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
mpirun -n $NGPU -ppn $NGPU_PER_NODE -f /tmp/deepspeed_mvapich_hostfile -env MV2_SUPPORT_DL=1 -env MV2_USE_GDR=0 -env MV2_USE_CUDA=1 -env MV2_USE_GDRCOPY=0 -env MV2_SMP_USE_CMA=0 -env MV2_DEBUG_SHOW_BACKTRACE=1 python ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": false,
"comm_backend_name": "nccl"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# This script requires pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs).
# Read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at https://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
Component. BingBertSquad
Open Source License/Copyright Notice.
Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522
}
# coding=utf-8
# This script references the following file from HuggingFace:
# https://github.com/huggingface/transformers/blob/d541938/src/transformers/modeling_bert.py
#
# It converts TensorFlow and HuggingFace checkpoint files to DeepSpeed.
import os
import argparse
import logging
import torch
import re
import numpy as np
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def set_data(param, array):
try:
assert param.shape == array.shape
except AssertionError as e:
e.args += (param.shape, array.shape)
raise
param.data = torch.from_numpy(array)
def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
""" Load tf checkpoints in DeepSpeed model.
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in DeepSpeed, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(ckpt_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
qkv = {}
for name_str, array in zip(names, arrays):
name = name_str.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info("Skipping {}".format("/".join(name)))
continue
pointer = model
key = None
skipping = False
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
# Special in deepspeed.
elif name_str.find("bert/pooler/dense") >= 0 and scope_names[0] == "dense":
pointer = getattr(pointer, "dense_act")
elif name_str.find("bert/embeddings/LayerNorm/gamma") >= 0 and scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif name_str.find("bert/embeddings/LayerNorm/beta") >= 0 and scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
skipping = True
break
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
# For transformer kernel layers.
if scope_names[0] == 'layer':
if name_str.find("attention/self/query/kernel") > 0:
key = "qw"
elif name_str.find("attention/self/query/bias") > 0:
key = "qb"
elif name_str.find("attention/self/key/kernel") > 0:
key = "kw"
elif name_str.find("attention/self/key/bias") > 0:
key = "kb"
elif name_str.find("attention/self/value/kernel") > 0:
key = "vw"
elif name_str.find("attention/self/value/bias") > 0:
key = "vb"
elif name_str.find("attention/output/dense/kernel") > 0:
pointer = getattr(pointer, "attn_ow")
elif name_str.find("attention/output/dense/bias") > 0:
pointer = getattr(pointer, "attn_ob")
elif name_str.find("attention/output/LayerNorm/gamma") > 0:
pointer = getattr(pointer, "attn_nw")
elif name_str.find("attention/output/LayerNorm/beta") > 0:
pointer = getattr(pointer, "attn_nb")
elif name_str.find("intermediate/dense/kernel") > 0:
pointer = getattr(pointer, "inter_w")
elif name_str.find("intermediate/dense/bias") > 0:
pointer = getattr(pointer, "inter_b")
elif name_str.find("output/dense/kernel") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_w")
elif name_str.find("output/dense/bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_b")
elif name_str.find("output/LayerNorm/gamma") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_w")
elif name_str.find("output/LayerNorm/beta") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_b")
else:
raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
break
if skipping:
continue
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif "kernel" in name:
array = np.transpose(array)
if key is not None:
qkv[key] = array
if all(k in qkv for k in ("qw", "kw", "vw")):
array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
pointer = getattr(pointer, "attn_qkvw")
qkv.pop("qw")
qkv.pop("kw")
qkv.pop("vw")
elif all(k in qkv for k in ("qb", "kb", "vb")):
array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
pointer = getattr(pointer, "attn_qkvb")
qkv.pop("qb")
qkv.pop("kb")
qkv.pop("vb")
elif key is not None:
# For Q/K/V weight/bias in TF, do nothing until all three are ready to merge.
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings/word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
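# Toy illustration (not used by the converters) of the Q/K/V merge performed
# above for the DeepSpeed transformer kernel: the three separate (hidden, hidden)
# query/key/value weights are concatenated along axis 0 into a single attn_qkvw
# of shape (3*hidden, hidden); the biases merge into attn_qkvb the same way.
def _demo_qkv_merge(hidden=4):
    qw, kw, vw = (np.random.rand(hidden, hidden) for _ in range(3))
    attn_qkvw = np.concatenate((qw, kw, vw), axis=0)
    assert attn_qkvw.shape == (3 * hidden, hidden)
    return attn_qkvw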
def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
""" Load huggingface checkpoints and convert to a deepspeed model.
"""
hf_path = os.path.abspath(ckpt_path)
logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
# Load weights from Huggingface model
ckpt = torch.load(hf_path, map_location=torch.device("cpu"))
qkv = {}
for name_str in ckpt.keys():
array = ckpt[name_str].numpy()
logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
name = name_str.split(".")
pointer = model
key = None
is_layer = False
skipping = False
for m_name in name:
# Special in deepspeed.
if name_str.find("bert.pooler.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
elif is_layer:
pass
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
logger.info("Skipping {}".format(".".join(name)))
skipping = True
break
if m_name == "layer":
is_layer = True
continue
if m_name.isnumeric() and is_layer:
num = int(m_name)
pointer = pointer[num]
is_layer = False
# For transformer kernel layers.
if name_str.find("attention.self.query.weight") > 0:
key = "qw"
elif name_str.find("attention.self.query.bias") > 0:
key = "qb"
elif name_str.find("attention.self.key.weight") > 0:
key = "kw"
elif name_str.find("attention.self.key.bias") > 0:
key = "kb"
elif name_str.find("attention.self.value.weight") > 0:
key = "vw"
elif name_str.find("attention.self.value.bias") > 0:
key = "vb"
elif name_str.find("attention.output.dense.weight") > 0:
pointer = getattr(pointer, "attn_ow")
elif name_str.find("attention.output.dense.bias") > 0:
pointer = getattr(pointer, "attn_ob")
elif name_str.find("attention.output.LayerNorm.weight") > 0:
pointer = getattr(pointer, "attn_nw")
elif name_str.find("attention.output.LayerNorm.bias") > 0:
pointer = getattr(pointer, "attn_nb")
elif name_str.find("intermediate.dense.weight") > 0:
pointer = getattr(pointer, "inter_w")
elif name_str.find("intermediate.dense.bias") > 0:
pointer = getattr(pointer, "inter_b")
elif name_str.find("output.dense.weight") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_w")
elif name_str.find("output.dense.bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_b")
elif name_str.find("output.LayerNorm.weight") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_w")
elif name_str.find("output.LayerNorm.bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_b")
else:
raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
break
if skipping:
continue
if key is not None:
qkv[key] = array
if all(k in qkv for k in ("qw", "kw", "vw")):
array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
pointer = getattr(pointer, "attn_qkvw")
qkv.pop("qw")
qkv.pop("kw")
qkv.pop("vw")
elif all(k in qkv for k in ("qb", "kb", "vb")):
array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
pointer = getattr(pointer, "attn_qkvb")
qkv.pop("qb")
qkv.pop("kb")
qkv.pop("vb")
elif key is not None:
# For Q/K/V weight/bias in HF, do nothing until all three are ready to merge.
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
""" Load huggingface checkpoints and convert to a deepspeed model.
"""
hf_path = os.path.abspath(ckpt_path)
logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
# Load weights from Huggingface model
ckpt = torch.load(hf_path, map_location=torch.device("cpu"))
qkv = {}
for name_str in ckpt.keys():
array = ckpt[name_str].numpy()
logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
name = name_str.split(".")
pointer = model
key = None
is_layer = False
skipping = False
for m_name in name:
# Special in deepspeed.
if name_str.find("intermediate.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
elif name_str.find("pooler.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
logger.info("Skipping {}".format(".".join(name)))
skipping = True
break
if skipping:
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
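# Toy illustration (not used by the loaders) of the vocab-size padding above:
# the DeepSpeed model rounds the vocabulary up to a multiple of 8, so the
# checkpoint's word-embedding matrix is extended with zero rows to match.
def _demo_vocab_padding(vocab_size=30522, hidden=4, multiple=8):
    voc_size_diff = (multiple - vocab_size % multiple) % multiple  # 6 for 30522
    array = np.random.rand(vocab_size, hidden)
    if voc_size_diff > 0:
        z = np.zeros((voc_size_diff, hidden), dtype=array.dtype)
        array = np.concatenate((array, z), axis=0)
    assert array.shape[0] % multiple == 0
    return array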
def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_enabled):
# Load weights from checkpoint
if ckpt_type == "HF":
if kernel_enabled:
load_hf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
else:
load_hf_weights_in_bert_torch(model, ckpt_path, vocab_diff)
elif ckpt_type == "TF":
if kernel_enabled:
load_tf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
else:
raise ValueError("--deepspeed_transformer_kernel is required for loading TF checkpoint.")
else:
raise ValueError(f"Invalid ckpt_type.")
{
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 10,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"weight_decay": 0.0,
"bias_correction": false
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
import argparse
import json
import evaluate as eval
if __name__ == '__main__':
expected_version = '1.1'
parser = argparse.ArgumentParser(description='Evaluation for SQuAD ' +
expected_version)
parser.add_argument('dataset_file', help='Dataset file')
parser.add_argument('prediction_file', help='Prediction File')
args = parser.parse_args()
print(
json.dumps(
eval.evaluate(expected_version, args.dataset_file,
args.prediction_file)))