Commit 316d3f90 authored by Pan,Huiwen

Add DeepSpeed framework test models

parent aebde649
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(
os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
'PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
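# A small illustration (not used elsewhere) of the naming scheme above: the cache
# entry is sha256(url), with '.' + sha256(etag) appended when an ETag is known, so
# the same URL/ETag pair always maps to the same file. The URL and ETag below are
# hypothetical.
def _demo_cache_filename():
    name = url_to_filename('https://example.com/vocab.txt', etag='"abc123"')
    url_part, _, etag_part = name.partition('.')
    assert len(url_part) == 64 and len(etag_part) == 64  # two sha256 hex digests
    return name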
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError(
"unable to parse {} as a URL or as a local path".format(
url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError(
"HEAD request failed for url {} with status code {}".format(
url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url,
temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name,
cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
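# Usage sketch for the cache helpers above (the URL is purely illustrative):
# cached_path() returns existing local paths unchanged and downloads http(s)/s3
# URLs into PYTORCH_PRETRAINED_BERT_CACHE, keyed by sha256(url) plus the ETag,
# with a .json metadata file written next to each cached entry.
if __name__ == '__main__':
    cached = cached_path('https://example.com/vocab.txt')   # hypothetical URL
    url, etag = filename_to_url(os.path.basename(cached))   # reads the .json metadata
    print(cached, url, etag)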
import logging
import torch.distributed as dist
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class Logger():
def __init__(self, cuda=False):
self.logger = logging.getLogger(__name__)
self.cuda = cuda
def info(self, message, *args, **kwargs):
if (self.cuda and dist.get_rank() == 0) or not self.cuda:
self.logger.info(message, *args, **kwargs)
def error(self, message, *args, **kwargs):
self.logger.error(message, *args, **kwargs)
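# A minimal sketch of the rank-aware logger above: with cuda=True, info() is
# emitted only by rank 0 of a torch.distributed job, while error() is emitted by
# every rank. The fallback to cuda=False below just lets the sketch also run in a
# single, non-distributed process.
if __name__ == '__main__':
    rank_aware = Logger(cuda=dist.is_initialized())
    rank_aware.info("printed once per job (rank 0 only) when distributed")
    rank_aware.error("printed by every rank")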
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class FocalLoss(nn.Module):
r"""
This criterion is a implemenation of Focal Loss, which is proposed in
Focal Loss for Dense Object Detection.
Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])
The losses are averaged across observations for each minibatch.
Args:
alpha(1D Tensor, Variable) : the scalar factor for this criterion
gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
putting more focus on hard, misclassified examples
size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch.
However, if the field size_average is set to False, the losses are
instead summed for each minibatch.
"""
def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
super(FocalLoss, self).__init__()
if alpha is None:
self.alpha = torch.ones(class_num, 1)
else:
if isinstance(alpha, Variable):
self.alpha = alpha
else:
self.alpha = Variable(alpha)
self.gamma = gamma
self.class_num = class_num
self.size_average = size_average
def forward(self, inputs, targets):
N = inputs.size(0)
C = inputs.size(1)
P = F.softmax(inputs, dim=1)  # softmax over the class dimension
class_mask = inputs.data.new(N, C).fill_(0)
# class_mask = Variable(class_mask)
ids = targets.view(-1, 1)
class_mask.scatter_(1, ids.data, 1.)
if inputs.is_cuda and not self.alpha.is_cuda:
self.alpha = self.alpha.cuda()
alpha = self.alpha[ids.data.view(-1)]
probs = (P * class_mask).sum(1).view(-1, 1)
log_p = probs.log()
batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p
if self.size_average:
loss = batch_loss.mean()
else:
loss = batch_loss.sum()
return loss
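# A quick sanity-check sketch for the FocalLoss above: with the default uniform
# alpha, the (1 - p)^gamma factor down-weights well-classified examples, so the
# focal loss on random logits is never larger than plain cross-entropy.
if __name__ == '__main__':
    logits = torch.randn(4, 3)              # N=4 examples, C=3 classes
    targets = torch.tensor([0, 2, 1, 2])
    focal = FocalLoss(class_num=3, gamma=2)
    print(float(focal(logits, targets)), float(F.cross_entropy(logits, targets)))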
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from turing.utils import TorchTuple
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.modeling import BertPreTrainingHeads, PreTrainedBertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
class BertPretrainingLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config):
super(BertPretrainingLoss, self).__init__(config)
self.bert = bert_encoder
self.cls = BertPreTrainingHeads(
config, self.bert.embeddings.word_embeddings.weight)
self.cls.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
masked_lm_labels=None,
next_sentence_label=None):
sequence_output, pooled_output = self.bert(
input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
prediction_scores, seq_relationship_score = self.cls(
sequence_output, pooled_output)
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
next_sentence_label.view(-1))
masked_lm_loss = loss_fct(
prediction_scores.view(-1, self.config.vocab_size),
masked_lm_labels.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
else:
return prediction_scores, seq_relationship_score
class BertClassificationLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config, num_labels: int = 1):
super(BertClassificationLoss, self).__init__(config)
self.bert = bert_encoder
self.num_labels = num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.classifier.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None):
_, pooled_output = self.bert(input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
scores = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.BCEWithLogitsLoss()
loss = loss_fct(scores.view(-1, self.num_labels),
labels.view(-1, 1))
return loss
else:
return scores
class BertRegressionLoss(PreTrainedBertModel):
def __init__(self, bert_encoder, config):
super(BertRegressionLoss, self).__init__(config)
self.bert = bert_encoder
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.classifier.apply(self.init_bert_weights)
def forward(self,
input_ids,
token_type_ids=None,
attention_mask=None,
labels=None):
_, pooled_output = self.bert(input_ids,
token_type_ids,
attention_mask,
output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, 1), labels.view(-1, 1))
return loss
else:
return logits
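# Sketch (not used elsewhere) of wiring a pretrained encoder into the
# classification head above; the checkpoint name is illustrative. The pattern is
# simply: load a BertModel, then wrap it with one of the *Loss heads defined here.
def _demo_classification_head():
    encoder = BertModel.from_pretrained('bert-base-uncased')  # illustrative checkpoint
    head = BertClassificationLoss(encoder, encoder.config, num_labels=1)
    input_ids = torch.zeros(2, 16, dtype=torch.long)          # toy batch of padded ids
    labels = torch.tensor([[1.0], [0.0]])                     # float targets for BCE-with-logits
    return head(input_ids, labels=labels)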
class BertMultiTask:
def __init__(self, args):
self.config = args.config
if not args.use_pretrain:
if args.progressive_layer_drop:
print("BertConfigPreLnLayerDrop")
from nvidia.modelingpreln_layerdrop import BertForPreTrainingPreLN, BertConfig
else:
from nvidia.modelingpreln import BertForPreTrainingPreLN, BertConfig
bert_config = BertConfig(**self.config["bert_model_config"])
bert_config.vocab_size = len(args.tokenizer.vocab)
# Padding for divisibility by 8
if bert_config.vocab_size % 8 != 0:
bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
print("VOCAB SIZE:", bert_config.vocab_size)
self.network = BertForPreTrainingPreLN(bert_config, args)
# Use pretrained bert weights
else:
self.bert_encoder = BertModel.from_pretrained(
self.config['bert_model_file'],
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
'distributed_{}'.format(args.local_rank))
bert_config = self.bert_encoder.config
self.device = None
def set_device(self, device):
self.device = device
def save(self, filename: str):
network = self.network.module
return torch.save(network.state_dict(), filename)
def load(self, model_state_dict: str):
return self.network.module.load_state_dict(
torch.load(model_state_dict,
map_location=lambda storage, loc: storage))
def move_batch(self, batch: TorchTuple, non_blocking=False):
return batch.to(self.device, non_blocking)
def eval(self):
self.network.eval()
def train(self):
self.network.train()
def save_bert(self, filename: str):
return torch.save(self.bert_encoder.state_dict(), filename)
def to(self, device):
assert isinstance(device, torch.device)
self.network.to(device)
def half(self):
self.network.half()
from tqdm import tqdm
from typing import Tuple
from random import shuffle
import pickle
import random
import numpy as np
from pathlib import Path
from pytorch_pretrained_bert.tokenization import BertTokenizer
def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if random.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
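# Tiny illustration (not used elsewhere) of the truncation helper above: tokens
# are removed in place, randomly from the front or the back of the longer list,
# until the pair fits within max_num_tokens.
def _demo_truncate():
    tokens_a = ['tok'] * 8
    tokens_b = ['tok'] * 4
    truncate_input_sequence(tokens_a, tokens_b, 9)
    assert len(tokens_a) + len(tokens_b) <= 9
    return tokens_a, tokens_b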
class TokenInstance:
""" This TokenInstance is a obect to have the basic units of data that should be
extracted from the raw text file and can be consumed by any BERT like model.
"""
def __init__(self, tokens_a, tokens_b, is_next, lang="en"):
self.tokens_a = tokens_a
self.tokens_b = tokens_b
self.is_next = is_next  # 0 if tokens_b is a continuation of tokens_a, 1 if it is random
self.lang = lang
def get_values(self):
return (self.tokens_a, self.tokens_b, self.is_next)
def get_lang(self):
return self.lang
class QueryPassageDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
qpl_tuple: Tuple[str, str, str] = line.split('\t')
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class QueryPassageFineTuningDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
entities = line.split('\t')
qpl_tuple: Tuple[str, str,
str] = (entities[0], entities[2], entities[4])
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class QueryInstanceDataset:
def __init__(self, path, readin=20000000):
all_pairs = []
with open(path, encoding="utf-8") as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
qpl_tuple: Tuple[str, str, str] = line.split('\t')
all_pairs.append(qpl_tuple)
if i > readin:
break
shuffle(all_pairs)
self.all_pairs = all_pairs
self.len = len(self.all_pairs)
def __len__(self):
return self.len
class PretrainingDataCreator:
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length,
readin: int = 2000000,
dupe_factor: int = 5,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# Expected format (Q,T,U,S,D)
# query, title, url, snippet, document = line.split('\t')
# ! remove this following line later
document = line
if len(document.split("<sep>")) <= 3:
continue
lines = document.split("<sep>")
document = []
for seq in lines:
document.append(tokenizer.tokenize(seq))
# document = list(map(tokenizer.tokenize, lines))
documents.append(document)
documents = [x for x in documents if x]
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
def __len__(self):
return self.len
def __getstate__(self):
state = self.__dict__.copy()
return state
def __setstate__(self, state):
self.__dict__.update(state)
def save(self, filename):
with open(filename, 'wb') as outfile:
pickle.dump(self, outfile)
@staticmethod
def load(filename):
with open(filename, 'rb') as f:
return pickle.load(f)
def create_training_instance(self, index):
document = self.documents[index]
# l = 0
# for s in document:
# l+=len(s)
# print(l)
# print(document)
# Need to add [CLS] + 2*[SEP] tokens
max_num_tokens = self.max_seq_length - 3
# We want to maximize the input sequence length but also want inputs similar
# to our generic task inputs, which will be comparatively smaller
# than the data on which we intend to pre-train.
target_seq_length = max_num_tokens
if random.random() < self.small_seq_prob:
target_seq_length = random.randint(5, max_num_tokens)
# We need to make the split point for the NSP task interesting,
# rather than choosing some arbitrary point. If not, the NSP
# task might become way too easy.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = random.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random Next
is_random_next = False
if len(current_chunk) == 1 or random.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# Pick a random document
for _ in range(10):
random_doc_index = random.randint(
0,
len(self.documents) - 1)
if random_doc_index != index:
break
random_doc = self.documents[random_doc_index]
random_start = random.randint(0, len(random_doc) - 1)
for j in range(random_start, len(random_doc)):
tokens_b.extend(random_doc[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual Next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
instances.append(
TokenInstance(tokens_a, tokens_b, int(is_random_next)))
# print(instances[-1])
current_chunk = []
current_length = 0
i += 1
# print(len(instances))
return instances
class CleanBodyDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 5,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
url, cleanbody, rand_int = line.rstrip("\n").split("\t")
cleanbody = cleanbody.replace("#TAB#", " ").replace(
"#NULL#", "").replace("#HASH#", "#")
cleanbody_parts = cleanbody.split("#R##N#")
for document in cleanbody_parts:
lines = document.split("#N#")
document = []
document_len = 0
for seq in lines:
tok_seq = tokenizer.tokenize(seq)
if len(tok_seq) != 0:
document.append(tok_seq)
document_len += len(tok_seq)
if document_len >= 200:
documents.append(document)
documents = [x for x in documents if x]
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 6,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
document = []
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# document = line
# if len(document.split("<sep>")) <= 3:
# continue
if len(line) == 0: # This is end of document
documents.append(document)
document = []
if len(line.split(' ')) > 2:
document.append(tokenizer.tokenize(line))
if len(document) > 0:
documents.append(document)
documents = [x for x in documents if x]
print(documents[0])
print(len(documents))
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class WikiPretrainingDataCreator(PretrainingDataCreator):
def __init__(self,
path,
tokenizer: BertTokenizer,
max_seq_length: int = 512,
readin: int = 2000000,
dupe_factor: int = 6,
small_seq_prob: float = 0.1):
self.dupe_factor = dupe_factor
self.max_seq_length = max_seq_length
self.small_seq_prob = small_seq_prob
documents = []
instances = []
with open(path, encoding='utf-8') as fd:
document = []
for i, line in enumerate(tqdm(fd)):
line = line.replace('\n', '')
# document = line
# if len(document.split("<sep>")) <= 3:
# continue
if len(line) > 0 and line[:2] == "[[":  # a "[[" title line starts a new document (the previous one ends here)
documents.append(document)
document = []
if len(line.split(' ')) > 2:
document.append(tokenizer.tokenize(line))
if len(document) > 0:
documents.append(document)
documents = [x for x in documents if x]
# print(len(documents))
# print(len(documents[0]))
# print(documents[0][0:10])
self.documents = documents
for _ in range(self.dupe_factor):
for index in range(len(self.documents)):
instances.extend(self.create_training_instance(index))
shuffle(instances)
self.instances = instances
self.len = len(self.instances)
self.documents = None
documents = None
class NumpyByteInstances:
TOKEN_SEP_VAL = int.from_bytes(b'\x1f', byteorder='big')
def __init__(self, data_creator):
self.data_creator = data_creator
self.getitem_fixed = self.sep_getitem_fixed if self.data_creator.use_separators else self.nosep_getitem_fixed
# if self.data_creator.multilingual:
# self.__getitem__ = self.getitem_multilingual
# else:
# self.__getitem__ = self.getitem_monolingual
def getitem_multilingual(self, i):
tokens_a, tokens_b, is_next = self.getitem_fixed(i)
return TokenInstance(tokens_a,
tokens_b,
is_next,
lang=self.data_creator.lang[i])
def getitem_monolingual(self, i):
return TokenInstance(*self.getitem_fixed(i))
def __getitem__(self, i):
if self.data_creator.multilingual:
return self.getitem_multilingual(i)
else:
return self.getitem_monolingual(i)
def nosep_getitem_fixed(self, i):
if i >= self.data_creator.len:
raise IndexError
if i < 0:
i += self.data_creator.len
instance_start, instance_end = self.data_creator.instance_offsets[i:i + 2]
tok_offsets_start, tok_offsets_end = self.data_creator.instance_token_offsets[i:i + 2]
token_offsets = self.data_creator.token_offsets[tok_offsets_start:tok_offsets_end]
tokens_split = self.data_creator.tokens_split[i]
token_arrs = np.split(self.data_creator.data[instance_start:instance_end], token_offsets)
tokens = [t.tostring().decode('utf8') for t in token_arrs]
return tokens[:tokens_split], tokens[tokens_split:], self.data_creator.is_next[i]
def sep_getitem_fixed(self, i):
if i >= self.data_creator.len:
raise IndexError
if i < 0:
i += self.data_creator.len
instance_start, instance_end = self.data_creator.instance_offsets[i:i + 2]
instance_data = self.data_creator.data[instance_start:instance_end]
tokens_split = self.data_creator.tokens_split[i]
# split on the token separator byte
token_arrs = np.split(instance_data, np.where(instance_data == NumpyByteInstances.TOKEN_SEP_VAL)[0])
# ignore the first byte (the separator itself) for tokens after the first
tokens = [(t[1:] if i > 0 else t).tostring().decode('utf8') for i, t in enumerate(token_arrs)]
return tokens[:tokens_split], tokens[tokens_split:], self.data_creator.is_next[i]
def __len__(self):
return self.data_creator.len
class NumpyPretrainingDataCreator:
def __init__(self, path, mmap=False):
path = Path(path)
self.path = path
mmap_mode = 'r' if mmap else None
self.data = np.load(str(path / 'data.npy'), mmap_mode=mmap_mode)
self.is_next = np.load(str(path / 'is_next.npy'), mmap_mode=mmap_mode)
self.tokens_split = np.load(str(path / 'tokens_split.npy'),
mmap_mode=mmap_mode)
self.instance_offsets = np.load(str(path / 'instance_offsets.npy'),
mmap_mode=mmap_mode)
if (path / 'instance_token_offsets.npy').is_file():
self.use_separators = False
self.instance_token_offsets = np.load(str(
path / 'instance_token_offsets.npy'),
mmap_mode=mmap_mode)
self.token_offsets = np.load(str(path / 'token_offsets.npy'),
mmap_mode=mmap_mode)
else:
self.use_separators = True
self.instance_token_offsets = None
self.token_offsets = None
if (path / 'lang.npy').is_file():
self.multilingual = True
self.lang = np.load(str(path / 'lang.npy'), mmap_mode=mmap_mode)
else:
self.multilingual = False
self.lang = None
self.instances = NumpyByteInstances(self)
self.len = len(self.is_next)
def __len__(self):
return self.len
@classmethod
def load(cls, path):
return cls(path)
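# Usage sketch for the numpy-backed dataset above; the directory path is
# hypothetical and must contain the .npy shards produced by the offline
# preprocessing step (data.npy, is_next.npy, tokens_split.npy, ...).
if __name__ == '__main__':
    data = NumpyPretrainingDataCreator.load('/path/to/preprocessed_shard')
    tokens_a, tokens_b, is_next = data.instances[0].get_values()
    print(len(data), len(tokens_a), len(tokens_b), is_next)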
import torch
PAD = 0
def mask(x):
return x != PAD
def torch_long(x):
return torch.LongTensor(x)
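# Tiny sketch of the two helpers above: build a LongTensor batch of padded token
# ids (the id values are arbitrary) and derive the attention mask by comparing
# against PAD (= 0).
if __name__ == '__main__':
    batch_ids = torch_long([[101, 2054, 102, 0, 0],
                            [101, 2129, 2003, 102, 0]])
    attention_mask = mask(batch_ids)  # True wherever the token is not padding
    print(attention_mask.long())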
import sys as _sys
from typing import List
from collections import _iskeyword # type: ignore
from tensorboardX import SummaryWriter
import os
SUMMARY_WRITER_DIR_NAME = 'runs'
def get_sample_writer(name, base=".."):
"""Returns a tensorboard summary writer
"""
return SummaryWriter(
log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name))
class TorchTuple(tuple):
def to(self, device, non_blocking=False):
raise NotImplementedError("")
_class_template = """\
from builtins import property as _property, tuple as _tuple
from operator import itemgetter as _itemgetter
from collections import OrderedDict
from turing.utils import TorchTuple
import torch
class {typename}(TorchTuple):
'{typename}({arg_list})'
__slots__ = ()
_fields = {field_names!r}
def __new__(_cls, {arg_list}):
'Create new instance of {typename}({arg_list})'
return _tuple.__new__(_cls, ({arg_list}))
@classmethod
def _make(cls, iterable, new=tuple.__new__, len=len):
'Make a new {typename} object from a sequence or iterable'
result = new(cls, iterable)
if len(result) != {num_fields:d}:
raise TypeError('Expected {num_fields:d} arguments, got %d' % len(result))
return result
def _replace(_self, **kwds):
'Return a new {typename} object replacing specified fields with new values'
result = _self._make(map(kwds.pop, {field_names!r}, _self))
if kwds:
raise ValueError('Got unexpected field names: %r' % list(kwds))
return result
def __repr__(self):
'Return a nicely formatted representation string'
return self.__class__.__name__ + '({repr_fmt})' % self
@property
def __dict__(self):
'A new OrderedDict mapping field names to their values'
return OrderedDict(zip(self._fields, self))
def _asdict(self):
'''Return a new OrderedDict which maps field names to their values.
This method is obsolete. Use vars(nt) or nt.__dict__ instead.
'''
return self.__dict__
def __getnewargs__(self):
'Return self as a plain tuple. Used by copy and pickle.'
return tuple(self)
def __getstate__(self):
'Exclude the OrderedDict from pickling'
return None
def to(self, device, non_blocking=False):
_dict = self.__dict__.copy()
new_dict = dict()
for key, value in _dict.items():
if isinstance(value, torch.Tensor):
if device.type != 'cpu' and non_blocking and torch.cuda.is_available():
new_dict[key] = value.cuda(device, non_blocking=non_blocking)
else:
new_dict[key] = value.to(device)
else:
new_dict[key] = value
return {typename}(**new_dict)
{field_defs}
"""
_repr_template = '{name}=%r'
_field_template = '''\
{name} = _property(_itemgetter({index:d}), doc='Alias for field number {index:d}')
'''
def namedtorchbatch(typename: str,
field_names: List[str],
verbose: bool = False,
rename: bool = False):
"""Returns a new subclass of tuple with named fields leveraging use of torch tensors.
"""
# Validate the field names. At the user's option, either generate an error
# message or automatically replace the field name with a valid name.
if isinstance(field_names, str):
field_names = field_names.replace(',', ' ').split()
field_names = list(map(str, field_names))
if rename:
seen: set = set()
for index, name in enumerate(field_names):
if (not name.isidentifier() or _iskeyword(name)
or name.startswith('_') or name in seen):
field_names[index] = '_%d' % index
seen.add(name)
for name in [typename] + field_names:
if not name.isidentifier():
raise ValueError('Type names and field names must be valid '
'identifiers: %r' % name)
if _iskeyword(name):
raise ValueError('Type names and field names cannot be a '
'keyword: %r' % name)
seen = set()
for name in field_names:
if name.startswith('_') and not rename:
raise ValueError('Field names cannot start with an underscore: '
'%r' % name)
if name in seen:
raise ValueError('Encountered duplicate field name: %r' % name)
seen.add(name)
# Fill-in the class template
class_definition = _class_template.format(
typename=typename,
field_names=tuple(field_names),
num_fields=len(field_names),
arg_list=repr(tuple(field_names)).replace("'", "")[1:-1],
repr_fmt=', '.join(
_repr_template.format(name=name) for name in field_names),
field_defs='\n'.join(
_field_template.format(index=index, name=name)
for index, name in enumerate(field_names)))
# Execute the template string in a temporary namespace and support
# tracing utilities by setting a value for frame.f_globals['__name__']
namespace = dict(__name__='namedtuple_%s' % typename)
exec(class_definition, namespace)
result = namespace[typename]
result._source = class_definition # type: ignore
if verbose:
print(result._source) # type: ignore
# For pickling to work, the __module__ variable needs to be set to the frame
# where the named tuple is created. Bypass this step in environments where
# sys._getframe is not defined (Jython for example) or sys._getframe is not
# defined for arguments greater than 0 (IronPython).
try:
result.__module__ = _sys._getframe(1).f_globals.get(
'__name__', '__main__')
except (AttributeError, ValueError):
pass
return result
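# Usage sketch for namedtorchbatch above: it generates a TorchTuple subclass whose
# .to(device) moves every torch.Tensor field and passes other fields through
# unchanged. The field names below are illustrative.
if __name__ == '__main__':
    import torch
    SquadBatch = namedtorchbatch('SquadBatch', ['input_ids', 'input_mask', 'labels'])
    batch = SquadBatch(input_ids=torch.zeros(2, 8, dtype=torch.long),
                       input_mask=torch.ones(2, 8, dtype=torch.long),
                       labels=torch.tensor([0, 1]))
    batch = batch.to(torch.device('cpu'))
    print(type(batch).__name__, batch.input_ids.shape)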
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": false,
"comm_backend_name": "mpi"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
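# Sketch of the batch-size bookkeeping behind the config above and the launch
# scripts below: DeepSpeed requires train_batch_size to equal
# micro_batch_per_gpu * gradient_accumulation_steps * world_size.
world_size = 4 * 8                                           # NUM_NODES * NGPU_PER_NODE in the scripts
micro_batch_per_gpu = 3                                      # train_micro_batch_size_per_gpu
grad_accum_steps = 96 // (micro_batch_per_gpu * world_size)  # = 1 here
assert micro_batch_per_gpu * grad_accum_steps * world_size == 96  # train_batch_size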
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
mpirun -n $NGPU -npernode $NGPU_PER_NODE -hostfile /job/hostfile -x UCX_TLS=tcp --mca btl ^openib --mca btl_tcp_if_include eth0 -x NCCL_TREE_THRESHOLD=0 -x NCCL_IB_DISABLE=1 -x NCCL_SOCKET_IFNAME=eth0 python ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": true,
"comm_backend_name": "mpi"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and is easier to use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
mpirun -n $NGPU -ppn $NGPU_PER_NODE -f /tmp/deepspeed_mvapich_hostfile -env MV2_SUPPORT_DL=1 -env MV2_USE_GDR=0 -env MV2_USE_CUDA=1 -env MV2_USE_GDRCOPY=0 -env MV2_SMP_USE_CMA=0 -env MV2_DEBUG_SHOW_BACKTRACE=1 python ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_mpi \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
{
"train_batch_size": 96,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 100,
"optimizer": {
"type": "OnebitAdam",
"params": {
"lr": 3e-5,
"freeze_step": 400,
"weight_decay": 0.0,
"bias_correction": false,
"cuda_aware": false,
"comm_backend_name": "nccl"
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
# This script requires pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs).
# Read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/
NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}
NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
GRAD_ACCUM_STEPS=1
else
GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json
# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=eth0 are used to disable InfiniBand. Remove them if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ../../nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
--do_lower_case \
--predict_batch_size 3 \
--do_predict \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size $PER_GPU_BATCH_SIZE \
--learning_rate ${LR} \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir $OUTPUT_DIR \
--job_name ${JOB_NAME} \
--gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
--fp16 \
--deepspeed \
--deepspeed_transformer_kernel \
--deepspeed_config ${config_json} \
--dropout ${DROPOUT} \
--model_file $MODEL_FILE \
--seed ${SEED} \
--ckpt_type HF \
--origin_bert_config_file ${ORIGIN_CONFIG_FILE} \
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at https://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
Component. BingBertSquad
Open Source License/Copyright Notice.
Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522
}
# coding=utf-8
# This script references the following file from HuggingFace:
# https://github.com/huggingface/transformers/blob/d541938/src/transformers/modeling_bert.py
#
# It converts TensorFlow and HuggingFace checkpoint files to DeepSpeed.
import os
import argparse
import logging
import torch
import re
import numpy as np
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def set_data(param, array):
try:
assert param.shape == array.shape
except AssertionError as e:
e.args += (param.shape, array.shape)
raise
param.data = torch.from_numpy(array)
def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
""" Load tf checkpoints in DeepSpeed model.
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in DeepSpeed, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(ckpt_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
qkv = {}
for name_str, array in zip(names, arrays):
name = name_str.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info("Skipping {}".format("/".join(name)))
continue
pointer = model
key = None
skipping = False
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
# Special in deepspeed.
elif name_str.find("bert/pooler/dense") >= 0 and scope_names[0] == "dense":
pointer = getattr(pointer, "dense_act")
elif name_str.find("bert/embeddings/LayerNorm/gamma") >= 0 and scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif name_str.find("bert/embeddings/LayerNorm/beta") >= 0 and scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
skipping = True
break
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
# For transformer kernel layers.
if scope_names[0] == 'layer':
if name_str.find("attention/self/query/kernel") > 0:
key = "qw"
elif name_str.find("attention/self/query/bias") > 0:
key = "qb"
elif name_str.find("attention/self/key/kernel") > 0:
key = "kw"
elif name_str.find("attention/self/key/bias") > 0:
key = "kb"
elif name_str.find("attention/self/value/kernel") > 0:
key = "vw"
elif name_str.find("attention/self/value/bias") > 0:
key = "vb"
elif name_str.find("attention/output/dense/kernel") > 0:
pointer = getattr(pointer, "attn_ow")
elif name_str.find("attention/output/dense/bias") > 0:
pointer = getattr(pointer, "attn_ob")
elif name_str.find("attention/output/LayerNorm/gamma") > 0:
pointer = getattr(pointer, "attn_nw")
elif name_str.find("attention/output/LayerNorm/beta") > 0:
pointer = getattr(pointer, "attn_nb")
elif name_str.find("intermediate/dense/kernel") > 0:
pointer = getattr(pointer, "inter_w")
elif name_str.find("intermediate/dense/bias") > 0:
pointer = getattr(pointer, "inter_b")
elif name_str.find("output/dense/kernel") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_w")
elif name_str.find("output/dense/bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_b")
elif name_str.find("output/LayerNorm/gamma") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_w")
elif name_str.find("output/LayerNorm/beta") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_b")
else:
raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
break
if skipping:
continue
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif "kernel" in name:
array = np.transpose(array)
if key is not None:
qkv[key] = array
if all(k in qkv for k in ("qw", "kw", "vw")):
array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
pointer = getattr(pointer, "attn_qkvw")
qkv.pop("qw")
qkv.pop("kw")
qkv.pop("vw")
elif all(k in qkv for k in ("qb", "kb", "vb")):
array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
pointer = getattr(pointer, "attn_qkvb")
qkv.pop("qb")
qkv.pop("kb")
qkv.pop("vb")
elif key is not None:
# For Q/K/V weight/bias in TF, do nothing until all three are ready to merge.
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings/word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
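# Toy illustration (not used by the converters) of the Q/K/V merge performed
# above for the DeepSpeed transformer kernel: the three separate (hidden, hidden)
# query/key/value weights are concatenated along axis 0 into a single attn_qkvw
# of shape (3*hidden, hidden); the biases merge into attn_qkvb the same way.
def _demo_qkv_merge(hidden=4):
    qw, kw, vw = (np.random.rand(hidden, hidden) for _ in range(3))
    attn_qkvw = np.concatenate((qw, kw, vw), axis=0)
    assert attn_qkvw.shape == (3 * hidden, hidden)
    return attn_qkvw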
def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
""" Load huggingface checkpoints and convert to a deepspeed model.
"""
hf_path = os.path.abspath(ckpt_path)
logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
# Load weights from Huggingface model
ckpt = torch.load(hf_path, map_location=torch.device("cpu"))
qkv = {}
for name_str in ckpt.keys():
array = ckpt[name_str].numpy()
logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
name = name_str.split(".")
pointer = model
key = None
is_layer = False
skipping = False
for m_name in name:
# Special in deepspeed.
if name_str.find("bert.pooler.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
elif is_layer:
pass
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
logger.info("Skipping {}".format(".".join(name)))
skipping = True
break
if m_name == "layer":
is_layer = True
continue
if m_name.isnumeric() and is_layer:
num = int(m_name)
pointer = pointer[num]
is_layer = False
# For transformer kernel layers.
if name_str.find("attention.self.query.weight") > 0:
key = "qw"
elif name_str.find("attention.self.query.bias") > 0:
key = "qb"
elif name_str.find("attention.self.key.weight") > 0:
key = "kw"
elif name_str.find("attention.self.key.bias") > 0:
key = "kb"
elif name_str.find("attention.self.value.weight") > 0:
key = "vw"
elif name_str.find("attention.self.value.bias") > 0:
key = "vb"
elif name_str.find("attention.output.dense.weight") > 0:
pointer = getattr(pointer, "attn_ow")
elif name_str.find("attention.output.dense.bias") > 0:
pointer = getattr(pointer, "attn_ob")
elif name_str.find("attention.output.LayerNorm.weight") > 0:
pointer = getattr(pointer, "attn_nw")
elif name_str.find("attention.output.LayerNorm.bias") > 0:
pointer = getattr(pointer, "attn_nb")
elif name_str.find("intermediate.dense.weight") > 0:
pointer = getattr(pointer, "inter_w")
elif name_str.find("intermediate.dense.bias") > 0:
pointer = getattr(pointer, "inter_b")
elif name_str.find("output.dense.weight") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_w")
elif name_str.find("output.dense.bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "output_b")
elif name_str.find("output.LayerNorm.weight") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_w")
elif name_str.find("output.LayerNorm.bias") > 0 and name_str.find("attention") < 0:
pointer = getattr(pointer, "norm_b")
else:
raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
break
if skipping:
continue
if key is not None:
qkv[key] = array
if all(k in qkv for k in ("qw", "kw", "vw")):
array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
pointer = getattr(pointer, "attn_qkvw")
qkv.pop("qw")
qkv.pop("kw")
qkv.pop("vw")
elif all(k in qkv for k in ("qb", "kb", "vb")):
array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
pointer = getattr(pointer, "attn_qkvb")
qkv.pop("qb")
qkv.pop("kb")
qkv.pop("vb")
elif key is not None:
# For Q/K/V weight/bias in HF, do nothing until all three are ready to merge.
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
""" Load huggingface checkpoints and convert to a deepspeed model.
"""
hf_path = os.path.abspath(ckpt_path)
logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
# Load weights from Huggingface model
ckpt = torch.load(hf_path, map_location=torch.device("cpu"))
qkv = {}
for name_str in ckpt.keys():
array = ckpt[name_str].numpy()
logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
name = name_str.split(".")
pointer = model
key = None
is_layer = False
skipping = False
for m_name in name:
# Special in deepspeed.
if name_str.find("intermediate.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
elif name_str.find("pooler.dense") >= 0 and m_name == "dense":
pointer = getattr(pointer, "dense_act")
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
logger.info("Skipping {}".format(".".join(name)))
skipping = True
break
if skipping:
continue
# The DeepSpeed BERT model pads the vocabulary size to a multiple of 8.
if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
array = np.concatenate((array, z), axis=0)
set_data(pointer, array)
logger.info("Initialize DeepSpeed weight {}".format(name))
return model
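# Toy illustration (not used by the loaders) of the vocab-size padding above:
# the DeepSpeed model rounds the vocabulary up to a multiple of 8, so the
# checkpoint's word-embedding matrix is extended with zero rows to match.
def _demo_vocab_padding(vocab_size=30522, hidden=4, multiple=8):
    voc_size_diff = (multiple - vocab_size % multiple) % multiple  # 6 for 30522
    array = np.random.rand(vocab_size, hidden)
    if voc_size_diff > 0:
        z = np.zeros((voc_size_diff, hidden), dtype=array.dtype)
        array = np.concatenate((array, z), axis=0)
    assert array.shape[0] % multiple == 0
    return array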
def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_enabled):
# Load weights from checkpoint
if ckpt_type == "HF":
if kernel_enabled:
load_hf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
else:
load_hf_weights_in_bert_torch(model, ckpt_path, vocab_diff)
elif ckpt_type == "TF":
if kernel_enabled:
load_tf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
else:
raise ValueError("--deepspeed_transformer_kernel is required for loading TF checkpoint.")
else:
raise ValueError(f"Invalid ckpt_type.")
{
"train_batch_size": 24,
"train_micro_batch_size_per_gpu": 3,
"steps_per_print": 10,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"weight_decay": 0.0,
"bias_correction": false
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true
}
}
import argparse
import json
import evaluate as eval
if __name__ == '__main__':
expected_version = '1.1'
parser = argparse.ArgumentParser(description='Evaluation for SQuAD ' +
expected_version)
parser.add_argument('dataset_file', help='Dataset file')
parser.add_argument('prediction_file', help='Prediction File')
args = parser.parse_args()
print(
json.dumps(
eval.evaluate(expected_version, args.dataset_file,
args.prediction_file)))