Commit e5ca7e62 authored by hepj987

Initialize repository
#!/bin/bash
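# Launcher for distributed SQuAD fine-tuning (run_squad_v4.py) under Open MPI on a 4-GPU
# ROCm node: the case block below pins each local rank to its own GPU (HIP_VISIBLE_DEVICES),
# InfiniBand HCA (UCX_NET_DEVICES), and NUMA node (numactl). FP16/AMP are commented out here.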
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 run_squad_v4.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/torch/SQuAD4 \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary /public/home/hepj/outdir/torch/SQuAD4/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank ${comm_rank} \
--world_size 4 \
--use_env \
--dist_url tcp://localhost:34567 \
--eval_script ./evaluate-v1.1.py
"
# --fp16 \
# --amp \
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
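# Same launcher as the script above, but with mixed precision enabled (--fp16 --amp).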
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 run_squad_v4.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/torch/SQuAD4 \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary /public/home/hepj/outdir/torch/SQuAD4/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank ${comm_rank} \
--world_size 4 \
--use_env \
--fp16 \
--amp \
--dist_url tcp://localhost:34567 \
--eval_script ./evaluate-v1.1.py
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
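# Non-distributed variant: runs run_squad_v1.py with --local_rank -1 (no world_size/dist_url),
# FP16/AMP enabled; each MPI local rank is still pinned to its own GPU, HCA, and NUMA node.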
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 run_squad_v1.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/tourch/SQuAD \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary ./log/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank -1 \
--fp16 \
--amp \
--eval_script ./evaluate-v1.1.py
"
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
import torch
import mhalib
###########################################################################################
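# Bmm1: first batched matmul of attention (Q x K^T) over variable-length, unpadded sequences.
# Inputs are flattened token-major tensors, `seqlen` holds per-sequence lengths, and the work
# is dispatched to the mhalib CUDA extension (FastBmm1Fprop / Dgrad1 / Dgrad2).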
class Bmm1Function(torch.autograd.Function):
@staticmethod
def forward(ctx, batch1, batch2, seqlen, batch, maxseqlen, heads, embed, scale, stream, sync):
ctx.save_for_backward(batch1, batch2, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.scale = scale
ctx.sync = sync
ctx.stream = stream
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
output = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16)
mhalib.FastBmm1Fprop(batch2.flatten().contiguous(), batch1.flatten().contiguous(), output.flatten().contiguous(), batch, seqlen, heads, embed, scale, False, stream, sync)
return output[:ntokens2*heads]
@staticmethod
def backward(ctx, grad_output):
batch1, batch2, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
grad_batch1 = torch.empty(ntokens,heads*embed, device="cuda", dtype=torch.float16)
grad_batch2 = torch.empty(ntokens,heads*embed, device="cuda", dtype=torch.float16)
mhalib.FastBmm1Dgrad2(batch2.flatten().contiguous(), grad_output.flatten().contiguous(), grad_batch1.flatten().contiguous(), batch, seqlen, heads, embed, ctx.scale, False, ctx.stream, ctx.sync)
mhalib.FastBmm1Dgrad1(batch1.flatten().contiguous(), grad_output.flatten().contiguous(), grad_batch2.flatten().contiguous(), batch, seqlen, heads, embed, ctx.scale, False, ctx.stream, ctx.sync)
return grad_batch1[:ntokens], grad_batch2[:ntokens], None, None, None, None, None, None, None, None
class Bmm1(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, scale=False, stream=True, sync=True):
super(Bmm1, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.scale = scale
self.sync = sync
self.stream = stream
def forward(self, batch1, batch2, batch, seqlen):
return Bmm1Function.apply(batch1, batch2, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.scale, self.stream, self.sync)
##########################################################################################
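# Bmm1Strided: same Q x K^T, but Q and K are read directly from a packed QKV buffer ("mixed")
# with row stride 3*embed. The forward pass also returns `mixed` so the strided Bmm2 can reuse
# it, and optional CUDA events time the fprop/dgrad/wgrad kernels.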
class Bmm1StridedFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, mixed, seqlen, batch, maxseqlen, heads, embed, scale, stream, sync, timers):
ctx.save_for_backward(mixed, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.scale = scale
ctx.sync = sync
ctx.stream = stream
ctx.timers = timers
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
output = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16)
if timers: timers['start_fprop'].record()
mhalib.FastBmm1Fprop(mixed, mixed, output, batch, seqlen, heads, embed, scale, True, stream, sync)
if timers: timers['stop_fprop'].record()
return output[:ntokens2*heads], mixed
@staticmethod
#def backward(ctx, grad_output):
def backward(ctx, grad_output, grad_mixed):
mixed, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
#grad_mixed = torch.empty([ntokens,heads*3*embed], device="cuda", dtype=torch.float16)
if ctx.timers: ctx.timers['start_dgrad'].record()
mhalib.FastBmm1Dgrad2(mixed, grad_output, grad_mixed, batch, seqlen, heads, embed, ctx.scale, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_dgrad'].record()
if ctx.timers: ctx.timers['start_wgrad'].record()
mhalib.FastBmm1Dgrad1(mixed, grad_output, grad_mixed, batch, seqlen, heads, embed, ctx.scale, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_wgrad'].record()
#return grad_mixed[:ntokens], None, None, None, None, None, None, None, None, None
        return grad_mixed[:ntokens], None, None, None, None, None, None, None, None, None
class Bmm1Strided(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, scale=True, stream=True, sync=True, timer=False):
super(Bmm1Strided, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.scale = scale
self.sync = sync
self.stream = stream
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'start_wgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True),
'stop_wgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, mixed, batch, seqlen):
return Bmm1StridedFunction.apply(mixed, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.scale, self.stream, self.sync, self.timers)
###########################################################################################
import torch
import mhalib
###########################################################################################
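# Bmm2: second batched matmul of attention (probs x V) over variable-length, unpadded
# sequences, again dispatched to the mhalib extension.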
class Bmm2Function(torch.autograd.Function):
@staticmethod
def forward(ctx, batch1, batch2, seqlen, batch, maxseqlen, heads, embed, sync, stream):
ctx.save_for_backward(batch1, batch2, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.stream = stream
ctx.sync = sync
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
output = torch.empty([ntokens,heads,embed], device="cuda", dtype=torch.float16)
mhalib.FastBmm2Fprop(batch2.flatten().contiguous(), batch1.flatten().contiguous(), output, batch, seqlen, heads, embed, False, False, stream, sync)
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
batch1, batch2, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
grad_batch1 = torch.empty([ntokens2*heads], device="cuda", dtype=torch.float16)
grad_batch2 = torch.empty([ntokens,heads*embed], device="cuda", dtype=torch.float16)
mhalib.FastBmm2Dgrad1(batch2.flatten().contiguous(), grad_output, grad_batch1, batch, seqlen, heads, embed, False, False, ctx.stream, ctx.sync)
mhalib.FastBmm2Dgrad2(grad_output, batch1, grad_batch2, batch, seqlen, heads, embed, False, False, ctx.stream, ctx.sync)
return grad_batch1[:ntokens2*heads], grad_batch2[:ntokens], None, None, None, None, None, None, None
class Bmm2(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, stream=True, sync=True):
super(Bmm2, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.stream = stream
self.sync = sync
def forward(self, batch1, batch2, batch, seqlen):
return Bmm2Function.apply(batch1, batch2, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.stream, self.sync)
###########################################################################################
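# Bmm2Strided: probs x V with V read from the packed QKV buffer ("mixed"); the backward pass
# writes the V gradient into a full [ntokens, 3*heads*embed] grad buffer.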
class Bmm2StridedFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, batch1, mixed, seqlen, batch, maxseqlen, heads, embed, stream, sync, timers):
ctx.save_for_backward(batch1, mixed, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.stream = stream
ctx.sync = sync
ctx.timers = timers
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
output = torch.empty([ntokens,heads,embed], device="cuda", dtype=torch.float16)
if timers: timers['start_fprop'].record()
mhalib.FastBmm2Fprop(mixed, batch1, output, batch, seqlen, heads, embed, False, True, stream, sync)
if timers: timers['stop_fprop'].record()
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
batch1, mixed, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
grad_batch1 = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16)
grad_mixed = torch.empty([ntokens,heads*3*embed], device="cuda", dtype=torch.float16)
if ctx.timers: ctx.timers['start_dgrad'].record()
mhalib.FastBmm2Dgrad1(mixed, grad_output, grad_batch1, batch, seqlen, heads, embed, False, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_dgrad'].record()
if ctx.timers: ctx.timers['start_wgrad'].record()
mhalib.FastBmm2Dgrad2(grad_output, batch1, grad_mixed, batch, seqlen, heads, embed, False, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_wgrad'].record()
return grad_batch1[:ntokens2*heads], grad_mixed[:ntokens], None, None, None, None, None, None, None, None
class Bmm2Strided(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, stream=True, sync=True, timer=False):
super(Bmm2Strided, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'start_wgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True),
'stop_wgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, batch1, mixed, batch, seqlen):
return Bmm2StridedFunction.apply(batch1, mixed, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.stream, self.sync, self.timers)
###########################################################################################
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
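# Converts a TensorFlow BERT pretraining checkpoint into a PyTorch checkpoint
# (a dict holding the model state_dict), presumably the file later passed to the
# SQuAD run scripts via --init_checkpoint.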
import torch
import argparse
from modeling import BertForPreTraining, BertConfig
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument('--tf_checkpoint',
type=str,
default="/google_bert_data",
help="Path to directory containing TF checkpoint")
parser.add_argument('--bert_config_path',
type=str,
default="/workspace/phase1",
help="Path bert_config.json is located in")
parser.add_argument('--output_checkpoint', type=str,
default='./checkpoint.pt',
help="Path to output PyT checkpoint")
return parser.parse_args()
def prepare_model(args, device):
# Prepare model
config = BertConfig.from_json_file(args.bert_config_path)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
print('padded vocab size to: {}'.format(config.vocab_size))
# Set some options that the config file is expected to have (but don't need to be set properly
# at this point)
config.pad = False
config.unpad = False
config.dense_seq_output = False
config.fused_mha = False
config.fused_gelu_bias = False
config.fuse_qkv = False
config.fuse_scale = False
config.fuse_mask = False
config.fuse_dropout = False
config.apex_softmax = False
config.enable_stream = False
if config.fuse_mask == True: config.apex_softmax = True
if config.pad == False: config.enable_stream = True
if config.unpad == True: config.fused_mha = False
#Load from TF checkpoint
model = BertForPreTraining.from_pretrained(args.tf_checkpoint, from_tf=True, config=config)
return model
def main():
args = parse_arguments()
device = torch.device("cuda")
model = prepare_model(args, device)
torch.save({'model' : model.state_dict() }, args.output_checkpoint)
if __name__ == "__main__":
main()
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None, from_tf=False):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
# if not os.path.exists(url_or_filename):
# raise ValueError("Local cached file does not exist: {}".format(parsed))
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif from_tf and os.path.exists(url_or_filename + ".meta"):
# TF checkpoint exists
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise ValueError("local cached file {} doesn't exist".format(cache_path))
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
import torch
from .fused_gelu import bias_gelu_impl
__all__ = ["bias_gelu_impl"]
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@torch.jit.script
def bias_gelu(bias, y):
x = bias + y
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
x = bias + y
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff*g
class GeLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
def forward(ctx, input, bias):
ctx.save_for_backward(input, bias)
return bias_gelu(bias, input)
@staticmethod
def backward(ctx, grad_output):
input, bias = ctx.saved_tensors
tmp = bias_gelu_back(grad_output, bias, input)
return tmp, tmp
bias_gelu_impl = GeLUFunction.apply
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func
from bmm1 import *
from bmm2 import *
from padding import *
from softmax import *
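# FastUnpadBertSelfAttention: BERT self-attention over unpadded, variable-length batches.
# Depending on the config flags it fuses the QKV projection into a single addmm, uses the
# strided Bmm1/Bmm2 kernels, and selects between fused mask+softmax+dropout, fused
# mask+softmax, apex softmax, or plain PyTorch softmax.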
class FastUnpadBertSelfAttention(nn.Module):
def __init__(self, config, enable_stream=True, enable_sync=True, fuse_mask=True, fuse_scale=True, fuse_qkv=True, fuse_dropout=True, apex_softmax=True, pad=True):
super(FastUnpadBertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.hidden_size = config.hidden_size
self.fuse_qkv = fuse_qkv
self.fuse_scale = fuse_scale
self.fuse_mask = fuse_mask
self.fuse_dropout = fuse_dropout
self.apex_softmax = apex_softmax
self.pad = pad
self.enable_stream = enable_stream
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
if self.fuse_qkv:
self.bmm1 = Bmm1Strided(None,None,self.num_attention_heads,self.attention_head_size, scale=self.fuse_scale, stream=enable_stream, sync=enable_sync, timer=False)
self.bmm2 = Bmm2Strided(None,None,self.num_attention_heads,self.attention_head_size, stream=enable_stream, sync=enable_sync, timer=False)
else:
self.bmm1 = Bmm1(None,None,self.num_attention_heads,self.attention_head_size, scale=self.fuse_scale, stream=enable_stream, sync=enable_sync)
self.bmm2 = Bmm2(None,None,self.num_attention_heads,self.attention_head_size, stream=enable_stream, sync=enable_sync)
if self.fuse_dropout == False:
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
if self.fuse_mask == True and self.fuse_dropout == True:
self.softmax = FastMaskSoftmaxDropout(dim=-1, dropout_prob=config.attention_probs_dropout_prob,stream=enable_stream, sync=(not self.pad), timer=False)
elif self.fuse_mask == True:
self.softmax = FastMaskSoftmax(dim=-1, stream=enable_stream, sync=enable_sync, timer=False)
else:
self.softmax = FastSoftmax(dim=-1, stream=enable_stream, sync=enable_sync, timer=False)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = torch.reshape(x, new_x_shape)
return x.permute(0, 2, 1, 3)
def transpose_key_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = torch.reshape(x, new_x_shape)
return x.permute(0, 2, 3, 1)
def pytorch_softmax(self,attention_scores, batch, seqlen, heads):
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]*self.num_attention_heads
attention_probs = torch.zeros(ntokens2, device="cuda", dtype=torch.float16)
ntokens2 = 0
for i in range(batch):
tokens2 = seqlen[i]*seqlen[i]*self.num_attention_heads
attention_probs[ntokens2:ntokens2+tokens2] = F.softmax(attention_scores[ntokens2:ntokens2+tokens2].view(1,self.num_attention_heads,seqlen[i],seqlen[i]), dim=-1).flatten().contiguous()
ntokens2 += tokens2
return attention_probs
def forward(self, hidden_states, attention_mask, seqlen, batch, is_training=True):
self.batch = batch
# QKV
if self.fuse_qkv:
weight = torch.cat([self.query.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size), self.key.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size), self.value.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size)], dim=1).reshape(self.all_head_size*3,self.hidden_size).contiguous()
bias = torch.cat([self.query.bias.view(self.num_attention_heads,1,self.attention_head_size), self.key.bias.view(self.num_attention_heads,1,self.attention_head_size), self.value.bias.view(self.num_attention_heads,1,self.attention_head_size)],dim=1).reshape(3*self.hidden_size).contiguous()
mixed_x_layer = torch.addmm(bias, hidden_states, weight.t())
else:
query_layer = self.query(hidden_states)
key_layer = self.key(hidden_states)
value_layer = self.value(hidden_states)
# BMM1.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_qkv:
attention_scores, qkv_layer = self.bmm1(mixed_x_layer, self.batch, seqlen)
else:
attention_scores = self.bmm1(query_layer, key_layer, self.batch, seqlen)
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_scale == False:
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Softmax.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_mask ==True and self.fuse_dropout == True:
attention_probs = self.softmax(attention_scores, attention_mask, self.batch, seqlen, self.num_attention_heads, is_training)
elif self.fuse_mask == True:
attention_probs = self.softmax(attention_scores, attention_mask, self.batch, seqlen, self.num_attention_heads)
else:
attention_scores = attention_scores + attention_mask.view(-1)
if self.apex_softmax == True:
attention_probs = self.softmax(attention_scores, self.batch, seqlen, self.num_attention_heads)
else:
if self.pad == True:
attention_probs = F.softmax(attention_scores.view(batch,self.num_attention_heads,seqlen[0],seqlen[0]), dim=-1).flatten().contiguous()
else:
attention_probs = self.pytorch_softmax(attention_scores, self.batch, seqlen, self.num_attention_heads)
# Dropout.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_dropout == False:
attention_probs = self.dropout(attention_probs)
# BMM2.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_qkv:
context_layer = self.bmm2(attention_probs, qkv_layer, self.batch, seqlen)
else:
context_layer = self.bmm2(attention_probs, value_layer, self.batch, seqlen)
if self.enable_stream: torch.cuda.synchronize()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = torch.reshape(context_layer, new_context_layer_shape)
return context_layer
#include <vector>
#include <iostream>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
//#include <cuda_profiler_api.h>
#include "THC/THC.h"
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <math.h>
#include "/opt/pytorch/apex/apex/contrib/csrc/multihead_attn/softmax.h"
#define nstreams 16
// global variables.
cudaStream_t stream[nstreams];
cublasHandle_t handle;
///////////////////////////////////////////////////////////////////////////////////////////////////
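// FastBmm1Fprop_: for each sequence, one cublasGemmStridedBatchedEx call computes Q*K^T for
// all heads; with enable_stream each sequence is issued on its own CUDA stream (round-robin
// over nstreams), and `strided` selects the packed-QKV layout (row stride 3*embed).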
void FastBmm1Fprop_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast<float>(embed));
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? embed : 0)); // key
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr())); // query
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr())); // output
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams]: at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seqlen[i],
seqlen[i],
embed,
static_cast<const void*>(scale ? &alpha : &one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm2Fprop_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0;
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? 2*embed : 0)); // value
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr())); // query*key
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr())); // output
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams]: at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(&one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
enable_stream ? heads*embed : batch*heads*embed,
embed,
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + seqlen[i]*heads*embed);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm1Dgrad1_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast<float>(embed));
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr())); // query
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr()) + (strided ? embed : 0)); // grad_key
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(scale ? &alpha : &one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
enable_stream ? heads : heads*batch,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm2Dgrad1_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0;
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? 2*embed : 0)); // value
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr()));
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seqlen[i],
seqlen[i],
embed,
static_cast<const void*>(&one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
enable_stream ? heads*embed : batch*heads*embed,
embed,
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + seqlen[i]*heads*embed);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm1Dgrad2_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast<float>(embed));
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? embed : 0)); // key
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr())); // grad query
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(scale ? &alpha : &one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm2Dgrad2_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0;
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()));
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr()) + (strided ? 2*embed : 0)); // grad-value
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(&one),
ptrA,
CUDA_R_16F,
enable_stream ? heads*embed : batch*heads*embed,
embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + seqlen[i]*heads*embed);
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastSoftmaxFprop_(torch::Tensor &input,
int batch,
torch::Tensor &seq_len,
int heads,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_softmax<half, half, float>(
reinterpret_cast<half*>(ptrIn),
reinterpret_cast<const half*>(ptrIn),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i]);
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastSoftmaxBprop_(torch::Tensor &input,
torch::Tensor &output,
int batch,
torch::Tensor &seq_len,
int heads,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrOut = static_cast<void*>(output.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_softmax_backward_stream<half, half, float>(
static_cast<half*>(ptrOut),
static_cast<half*>(ptrOut),
reinterpret_cast<half const*>(ptrIn),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrOut = static_cast<void*>(static_cast<half*>(ptrOut) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastMaskSoftmaxFprop_(torch::Tensor &input,
torch::Tensor &mask,
int batch,
torch::Tensor &seq_len,
int heads,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrMask = static_cast<void*>(mask.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_additive_masked_softmax_stream<half, half, float>(
reinterpret_cast<half*>(ptrIn),
reinterpret_cast<const half*>(ptrIn),
reinterpret_cast<const half*>(ptrMask),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? heads*seqlen[i] : heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrMask = static_cast<void*>(static_cast<half*>(ptrMask) + seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
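// FastMaskSoftmaxDropoutFprop_: additive-mask softmax applied in place per sequence, then
// (when training) at::_fused_dropout; returns the dropout results and the dropout mask.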
std::vector<torch::Tensor> FastMaskSoftmaxDropoutFprop_(torch::Tensor &input,
torch::Tensor &mask,
int batch,
torch::Tensor &seq_len,
int heads,
float dropout_prob,
bool enable_stream,
bool sync,
bool is_training)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrMask = static_cast<void*>(mask.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_additive_masked_softmax_stream<half, half, float>(
reinterpret_cast<half*>(ptrIn),
reinterpret_cast<const half*>(ptrIn),
reinterpret_cast<const half*>(ptrMask),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? heads*seqlen[i] : heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrMask = static_cast<void*>(static_cast<half*>(ptrMask) + seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
int ntokens = seqlen[0];
for(int i = 1; i < (enable_stream ? batch : 2); i++) {
ntokens += seqlen[i];
}
auto act_options = input.options().requires_grad(false);
auto mask_options = act_options.dtype(torch::kUInt8);
torch::Tensor dropout_results = torch::empty({batch*heads, ntokens}, act_options);
torch::Tensor dropout_mask = torch::empty({batch*heads, ntokens}, mask_options);
//torch::Tensor dropout_results = torch::empty({batch*heads, seqlen[0], seqlen[0]}, act_options);
//torch::Tensor dropout_mask = torch::empty({batch*heads, seqlen[0], seqlen[0]}, mask_options);
if (is_training) {
//use at:: function so that C++ version generates the same random mask as python version
auto dropout_tuple = at::_fused_dropout(input, 1.0f-dropout_prob);
dropout_results = std::get<0>(dropout_tuple);
dropout_mask = std::get<1>(dropout_tuple);
}
return {dropout_results, dropout_mask};
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastMaskSoftmaxDropoutBprop_(torch::Tensor &input,
torch::Tensor &output,
torch::Tensor &dropout_mask,
int batch,
torch::Tensor &seq_len,
int heads,
float dropout_prob,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrOut = static_cast<void*>(output.data_ptr());
void *ptrDropoutMask = static_cast<void*>(dropout_mask.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_masked_scale_softmax_backward_stream<half, half, float, false>(
static_cast<half*>(ptrOut),
static_cast<half*>(ptrOut),
reinterpret_cast<half const*>(ptrIn),
reinterpret_cast<uint8_t const*>(ptrDropoutMask),
1.0/(1.0-dropout_prob),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrOut = static_cast<void*>(static_cast<half*>(ptrOut) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void init_mha_cuda_extension()
{
// CUDA Stream.
for(int i = 0; i < nstreams; i++) {
cudaStreamCreate(&stream[i]);
}
// CuBlas Handle.
cublasCreate(&handle);
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("InitMHACUDAExtension", &init_mha_cuda_extension, "InitMHACUDAExtension");
m.def("FastBmm1Fprop", &FastBmm1Fprop_, "FastBmm1Fprop");
m.def("FastBmm1Dgrad1", &FastBmm1Dgrad1_, "FastBmm1Dgrad1");
m.def("FastBmm1Dgrad2", &FastBmm1Dgrad2_, "FastBmm1Dgrad2");
m.def("FastBmm2Fprop", &FastBmm2Fprop_, "FastBmm2Fprop");
m.def("FastBmm2Dgrad1", &FastBmm2Dgrad1_, "FastBmm2Dgrad1");
m.def("FastBmm2Dgrad2", &FastBmm2Dgrad2_, "FastBmm2Dgrad2");
m.def("FastSoftmaxFprop", &FastSoftmaxFprop_, "FastSoftmaxFprop");
m.def("FastSoftmaxBprop", &FastSoftmaxBprop_, "FastSoftmaxBprop");
m.def("FastMaskSoftmaxFprop", &FastMaskSoftmaxFprop_, "FastMaskSoftmaxFprop");
m.def("FastMaskSoftmaxDropoutFprop", &FastMaskSoftmaxDropoutFprop_, "FastMaskSoftmaxDropoutFprop");
m.def("FastMaskSoftmaxDropoutBprop", &FastMaskSoftmaxDropoutBprop_, "FastMaskSoftmaxDropoutBprop");
}
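# setup.py for the mhalib extension: compiles mha_funcs.cu as a PyTorch CUDAExtension
# (code generated for sm_70 and sm_80).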
import torch
import setuptools
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='mhalib',
ext_modules=[
CUDAExtension(
name='mhalib',
sources=['mha_funcs.cu'],
extra_compile_args={
'cxx': ['-O3',],
'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode arch=compute_70,code=sm_70','-gencode arch=compute_80,code=sm_80','-gencode arch=compute_80,code=compute_80']
}
)
],
cmdclass={
'build_ext': BuildExtension
})
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
from operator import mul
from functools import reduce
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils import checkpoint
from apex.contrib.multihead_attn import SelfMultiheadAttn
from file_utils import cached_path
from layers.fused_gelu import bias_gelu_impl as bias_gelu
from utils import get_rank
import mhalib
from mha import *
logger = logging.getLogger(__name__)
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
def remap_attn_names_tf(name):
if 'attention' in name:
ind = name.index("attention")
if 'self' in name and 'query' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'q_weight']
if 'self' in name and 'query' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'q_bias']
if 'self' in name and 'key' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'k_weight']
if 'self' in name and 'key' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'k_bias']
if 'self' in name and 'value' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'v_weight']
if 'self' in name and 'value' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'v_bias']
if 'output' in name and 'dense' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'out_proj_weight']
if 'output' in name and 'dense' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'out_proj_bias']
if 'output' in name and 'LayerNorm' in name:
name = name[:(ind+1)] + ['layer_norm'] + name[-1:]
return name
def load_tf_weights_in_bert(model, tf_checkpoint_path, use_fast_mha=False):
""" Load tf checkpoints in a pytorch model
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(tf_checkpoint_path)
if get_rank() == 0:
print("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
if get_rank() == 0:
print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
# MHA params need to be treated separately
if use_fast_mha:
mha_params = ['q_weight', 'q_bias', 'k_weight', 'k_bias', 'v_weight', 'v_bias', 'out_proj_weight', 'out_proj_bias']
else:
mha_params = []
for name, array in zip(names, arrays):
name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
if any(n in ["adam_v", "adam_m", "global_step", "LAMB", "LAMB_1", "beta1_power", "beta2_power"] for n in name):
if get_rank() == 0:
print("Skipping {}".format("/".join(name)))
continue
if use_fast_mha:
name = remap_attn_names_tf(name)
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] in mha_params:
pointer = getattr(pointer, l[0])
elif l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel' or (m_name in mha_params and 'bias' not in m_name):
array = np.ascontiguousarray(np.transpose(array))
try:
assert pointer.shape == array.shape
except AssertionError as e:
# If copying smaller into larger, assume padded and ok
if reduce(mul, pointer.shape) > reduce(mul, array.shape):
if get_rank() == 0:
print("Initialize padded PyTorch weight {}".format(name))
pointer.data.zero_()
def generate_slices():
slices = []
for i in range(array.ndim):
slices.append(slice(0, array.shape[i], 1))
return slices
pointer.data[generate_slices()] = torch.from_numpy(array)
else:
e.args += (pointer.shape, array.shape)
raise
else:
if get_rank() == 0:
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
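# Illustrative usage sketch (not called anywhere in this file): how the TF checkpoint
# loader above would typically be driven. The checkpoint prefix is hypothetical and
# TensorFlow must be installed for the call to succeed.
def _example_load_tf_weights(model, tf_ckpt_prefix="/path/to/model.ckpt-28252"):
    # `model` is an already constructed BertForPreTraining / BertForQuestionAnswering
    # whose config matches the checkpoint; weights are copied in place and returned.
    return load_tf_weights_in_bert(model, tf_ckpt_prefix, use_fast_mha=False)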
def swish(x):
return x * torch.sigmoid(x)
def fast_gelu(x):
pi = 3.1415926535897932
cdf = 0.5 * (1.0 + torch.tanh((math.sqrt(2 / pi) * (x + 0.044715 * torch.pow(x, 3)))))
return x*cdf
#torch.nn.functional.gelu(x) # Breaks ONNX export
#ACT2FN = {"gelu": torch.nn.functional.gelu, "bias_gelu": bias_gelu, "relu": torch.nn.functional.relu, "swish": swish}
ACT2FN = {"gelu": fast_gelu, "bias_gelu": bias_gelu, "relu": torch.nn.functional.relu, "swish": swish}
class LinearActivation(torch.nn.Linear):
r"""Fused Linear and activation Module.
"""
__constants__ = ['bias']
def __init__(self, in_features, out_features, act='gelu', bias=True):
super(LinearActivation, self).__init__(in_features, out_features, bias)
self.act_fn = nn.Identity() #
self.biased_act_fn = None #
if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)): # For TorchScript
if bias and not 'bias' in act: # compatibility
act = 'bias_' + act #
self.biased_act_fn = ACT2FN[act] #
else:
self.act_fn = ACT2FN[act]
else:
self.act_fn = act
def forward(self, input):
if self.bias is not None:
return self.biased_act_fn(self.bias, nn.functional.linear(input, self.weight, None))
else:
return self.act_fn(nn.functional.linear(input, self.weight, None))
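# Minimal sketch of LinearActivation dispatch (illustrative only). With a string
# activation and bias=True the forward pass routes through the fused bias_gelu
# kernel imported from layers.fused_gelu; whether that kernel accepts CPU float
# tensors is an assumption here -- during training it runs on the GPU.
def _example_linear_activation():
    layer = LinearActivation(1024, 4096, act='gelu', bias=True)
    x = torch.randn(8, 1024)
    return layer(x)  # shape [8, 4096]: GELU(x @ W^T + b) via the fused path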
class BertConfig(object):
"""Configuration class to store the configuration of a `BertModel`.
"""
def __init__(self,
vocab_size_or_config_json_file,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
"""Constructs BertConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
else:
raise ValueError("First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)")
@classmethod
def from_dict(cls, json_object):
"""Constructs a `BertConfig` from a Python dictionary of parameters."""
config = BertConfig(vocab_size_or_config_json_file=-1)
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
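# Illustrative construction of a BertConfig (the values are the BERT-base defaults);
# the round trip below shows that to_dict()/from_dict() preserve the settings.
def _example_bert_config():
    config = BertConfig(vocab_size_or_config_json_file=30522, hidden_size=768,
                        num_hidden_layers=12, num_attention_heads=12,
                        intermediate_size=3072)
    same_config = BertConfig.from_dict(config.to_dict())
    return same_config.to_json_string()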
try:
import apex
#apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
import apex.normalization
#apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
BertLayerNorm = apex.normalization.FusedLayerNorm
except ImportError:
print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class BertSelfAttention(nn.Module):
def __init__(self, config):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def transpose_key_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 3, 1)
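# Note: transpose_for_scores returns [batch, heads, seq_len, head_size] while
# transpose_key_for_scores returns [batch, heads, head_size, seq_len], so the
# matmul in forward() computes Q.K^T directly without an explicit transpose.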
def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_key_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer)
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask (precomputed for all layers in the BertModel forward() function)
attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(2)
# Normalize the attention scores to probabilities.
attention_probs = self.softmax(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer
class BertSelfOutput(nn.Module):
def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# This module uses Apex C++ multihead attention implementation with fusions.
class FastBertAttention(nn.Module):
def __init__(self, config):
super(FastBertAttention, self).__init__()
self.multi_head_attention = SelfMultiheadAttn(config.hidden_size, config.num_attention_heads, dropout = config.attention_probs_dropout_prob, bias=True, include_norm_add=False, impl='fast', separate_qkv_params=True, mask_additive=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.layer_norm = BertLayerNorm(config.hidden_size, eps=1e-12)
def forward(self, input_tensor, attention_mask):
residual=input_tensor
multi_head_attention_output,_ = self.multi_head_attention(query = input_tensor, key = input_tensor, value = input_tensor, key_padding_mask=attention_mask, need_weights=True,attn_mask = None, is_training = self.training)
attention_output = self.dropout(multi_head_attention_output)
attention_output = self.layer_norm(attention_output + residual)
return attention_output
class FastUnpadBertAttention(nn.Module):
def __init__(self, config):
super(FastUnpadBertAttention, self).__init__()
self.self = FastUnpadBertSelfAttention(config, enable_stream=config.enable_stream, enable_sync=False, fuse_mask=config.fuse_mask, fuse_scale=config.fuse_scale, fuse_qkv=config.fuse_qkv, fuse_dropout=config.fuse_dropout, apex_softmax=config.apex_softmax, pad=config.pad)
self.output = BertSelfOutput(config)
def forward(self, input_tensor, attention_mask, seqlen, batch):
self_output = self.self(input_tensor, attention_mask, seqlen, batch, is_training = self.training)
attention_output = self.output(self_output, input_tensor)
return attention_output
class BertAttention(nn.Module):
def __init__(self, config):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)
def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output
class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.fused_gelu_bias = config.fused_gelu_bias
if config.fused_gelu_bias:
self.dense = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act)
else:
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
if not self.fused_gelu_bias:
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BertOutput(nn.Module):
def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertLayer(nn.Module):
def __init__(self, config):
super(BertLayer, self).__init__()
self.unpad = config.unpad
if config.fused_mha:
self.attention = FastBertAttention(config)
elif config.unpad:
self.attention = FastUnpadBertAttention(config)
else:
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(self, hidden_states, attention_mask, seqlen, batch):
if self.unpad:
attention_output = self.attention(hidden_states, attention_mask, seqlen, batch)
else:
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class BertEncoder(nn.Module):
def __init__(self, config):
super(BertEncoder, self).__init__()
layer = BertLayer(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
self.num_attention_heads = config.num_attention_heads
self.fused_mha=config.fused_mha
self.unpad=config.unpad
self.pad = config.pad
self.fuse_mask = config.fuse_mask
self.enable_stream = config.enable_stream
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False):
# Unpad inputs and mask. It will remove tokens that are padded. Assume ntokens is total number of tokens (padded and non-padded)
# and ntokens_unpad is total number of non-padded tokens. Then unpadding performs the following compression of the inputs:
# hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden]
batch = None
seqlen = None
if self.unpad:
batch = hidden_states.shape[0]
maxseqlen = hidden_states.shape[1]
hidden_size = hidden_states.shape[2]
attention_indices, attention_mask, seqlen, ntokens = generate_mask(attention_mask, self.num_attention_heads, pad=self.pad, fuse_mask=self.fuse_mask)
if self.pad == True and self.enable_stream == False:
hidden_states = hidden_states.view(batch,maxseqlen,hidden_size).permute(1,0,2).contiguous().view(batch*maxseqlen,hidden_size).contiguous()
if self.pad == True and self.enable_stream == True:
hidden_states = hidden_states.view(batch*maxseqlen,hidden_size)
if self.pad == False:
hidden_states = UnpadInput.apply(hidden_states.view(batch*maxseqlen, hidden_size).contiguous(), attention_indices, batch, maxseqlen, hidden_size, ntokens)
all_encoder_layers = []
def custom(start, end):
def custom_forward(*inputs):
layers = self.layer[start:end]
x_ = inputs[0]
for layer in layers:
x_ = layer(x_, inputs[1])
return x_
return custom_forward
if checkpoint_activations:
l = 0
num_layers = len(self.layer)
chunk_length = math.ceil(math.sqrt(num_layers))
while l < num_layers:
hidden_states = checkpoint.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1)
l += chunk_length
else:
if self.fused_mha:
hidden_states = hidden_states.permute(1,0,2).contiguous()
for i,layer_module in enumerate(self.layer):
hidden_states = layer_module(hidden_states, attention_mask, seqlen, batch)
if output_all_encoded_layers:
if self.fused_mha:
all_encoder_layers.append(hidden_states.permute(1,0,2).contiguous())
else:
all_encoder_layers.append(hidden_states)
# Pad inputs and mask. It will insert back zero-padded tokens. Assume ntokens is total number of tokens (padded and non-padded)
# and ntokens_unpad is total number of non-padded tokens. Then padding performs the following de-compression:
# hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden]
if self.unpad:
if self.pad == True and self.enable_stream == False:
hidden_states = hidden_states.view(maxseqlen,batch,hidden_size).permute(1,0,2).contiguous().view(batch,maxseqlen,hidden_size).contiguous()
if self.pad == True and self.enable_stream == True:
hidden_states = hidden_states.view(batch,maxseqlen,hidden_size)
if self.pad == False:
hidden_states = PadInput.apply(hidden_states, attention_indices, batch, maxseqlen, hidden_size, ntokens).view(batch, maxseqlen, hidden_size).contiguous()
if not output_all_encoded_layers or checkpoint_activations:
if self.fused_mha:
all_encoder_layers.append(hidden_states.permute(1,0,2).contiguous())
else:
all_encoder_layers.append(hidden_states)
return all_encoder_layers
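# Note on checkpoint_activations above: layers are checkpointed in contiguous
# chunks of ceil(sqrt(num_layers)) (e.g. 5 layers per chunk for 24-layer
# BERT-large), trading extra recomputation in backward for lower activation memory.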
class BertPooler(nn.Module):
def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config, bert_model_embedding_weights):
super(BertLMPredictionHead, self).__init__()
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
bert_model_embedding_weights.size(0),
bias=False)
self.decoder.weight = bert_model_embedding_weights
self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states
class BertOnlyMLMHead(nn.Module):
def __init__(self, config, bert_model_embedding_weights):
super(BertOnlyMLMHead, self).__init__()
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class BertOnlyNSPHead(nn.Module):
def __init__(self, config):
super(BertOnlyNSPHead, self).__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
class BertPreTrainingHeads(nn.Module):
def __init__(self, config, bert_model_embedding_weights):
super(BertPreTrainingHeads, self).__init__()
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
self.dense_seq_output = config.dense_seq_output
def forward(self, sequence_output, pooled_output, masked_lm_labels):
if self.dense_seq_output:
# We are masking out elements that won't contribute to loss because of masked lm labels
sequence_flattened = torch.index_select(sequence_output.view(-1,sequence_output.shape[-1]), 0, torch.nonzero(masked_lm_labels.view(-1) != -1, as_tuple=False).squeeze())
sequence_output = sequence_flattened
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
))
self.config = config
# Make sure the vocab size is padded up to a multiple of 8.
if self.config.vocab_size % 8 != 0:
self.config.vocab_size += 8 - (self.config.vocab_size % 8)
if get_rank() == 0:
print(f'Padded vocab_size to : {self.config.vocab_size}')
def init_bert_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
@classmethod
def from_pretrained(cls, pretrained_checkpoint, state_dict=None, cache_dir=None,
from_tf=False, config=None, *inputs, **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_checkpoint: either:
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
logger.info("loading archive file {}".format(pretrained_checkpoint))
assert config, "BERT configuration file must be provided to from_pretraining()"
logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(config, *inputs, **kwargs)
if state_dict is None and not from_tf:
state_dict = torch.load(pretrained_checkpoint, map_location='cpu' if not torch.cuda.is_available() else None)
if from_tf:
# Directly load from a TensorFlow checkpoint
return load_tf_weights_in_bert(model, pretrained_checkpoint, use_fast_mha=config.fused_mha)
# Load from a PyTorch state_dict
old_keys = []
new_keys = []
# print(f'loading keys: {state_dict.keys()}')
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
start_prefix = ''
if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
start_prefix = 'bert.'
load(model, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
return model
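# Illustrative usage sketch for from_pretrained (never called here). The config
# must already carry the extra flags this implementation reads (unpad, fused_mha,
# fused_gelu_bias, dense_seq_output, pad, ...); the checkpoint path is hypothetical.
def _example_from_pretrained(config, checkpoint_path="/path/to/pytorch_model.bin"):
    return BertForQuestionAnswering.from_pretrained(checkpoint_path, config=config)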
class BertModel(BertPreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
config: a BertConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated with the first token of the
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertModel, self).__init__(config)
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)
self.apply(self.init_bert_weights)
self.unpad = config.unpad
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, checkpoint_activations=False):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
# The attention mask is kept as a 2D [batch_size, seq_length] tensor here; the
# broadcast dimensions ([batch_size, 1, 1, to_seq_length], shared across heads
# and query positions) or the unpadded form are produced inside the attention
# implementations. This mask is simpler than the triangular causal mask used in
# OpenAI GPT; it only hides padding tokens.
extended_attention_mask = attention_mask
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
if self.unpad == False:
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers, checkpoint_activations=checkpoint_activations)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output
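# Illustrative sketch of the additive mask built above for the padded path:
# real tokens contribute 0.0 to the raw attention scores, padded tokens
# contribute -10000.0, which drives their softmax weight to ~0.
def _example_extended_attention_mask():
    attention_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0, 0.0]])
    additive = (1.0 - attention_mask) * -10000.0
    return additive  # tensor([[0., 0., 0., -10000., -10000.]])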
class BertForPreTraining(BertPreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForPreTraining, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
self.dense_seq_output = config.dense_seq_output
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, checkpoint_activations=False):
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations)
# If dense_seq_output, the prediction scores returned by self.cls are already restricted to the masked positions, with the first dimension flattened
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_lm_labels)
if self.dense_seq_output:
masked_lm_labels_flat = masked_lm_labels.view(-1)
mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -1]
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
if self.dense_seq_output:
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), mlm_labels.view(-1))
else:
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
#print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))
total_loss = masked_lm_loss + next_sentence_loss
# Masked Language Model Accuracy
if not self.dense_seq_output:
prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1])
masked_lm_labels_flat = masked_lm_labels.view(-1)
mlm_predictions_scores = prediction_scores_flat[masked_lm_labels_flat != -1]
mlm_predictions = mlm_predictions_scores.argmax(dim=-1)
mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -1]
else:
mlm_predictions = prediction_scores.argmax(dim=-1)
mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float)/mlm_labels.numel()
return total_loss, mlm_acc, mlm_labels.numel()
else: #TODO: Handle this path for dense sequence output as well
return prediction_scores, seq_relationship_score
class BertForMaskedLM(BertPreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForMaskedLM, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, checkpoint_activations=False):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
prediction_scores = self.cls(sequence_output)
if masked_lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
return masked_lm_loss
else:
return prediction_scores
class BertForNextSentencePrediction(BertPreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForNextSentencePrediction, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, checkpoint_activations=False):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
seq_relationship_score = self.cls(pooled_output)
if next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
return next_sentence_loss
else:
return seq_relationship_score
class BertForSequenceClassification(BertPreTrainedModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
class BertForMultipleChoice(BertPreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(config, num_choices)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_choices):
super(BertForMultipleChoice, self).__init__(config)
self.num_choices = num_choices
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, self.num_choices)
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
return loss
else:
return reshaped_logits
class BertForTokenClassification(BertPreTrainedModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
class BertForQuestionAnswering(BertPreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.bert = BertModel(config)
# TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, checkpoint_activations=False):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, the positions may carry an extra dimension; squeeze it out
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions fall outside the model inputs; clamp them to an ignored index so they do not contribute to the loss
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
return total_loss
else:
return start_logits, end_logits
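# Illustrative post-processing sketch for BertForQuestionAnswering at inference
# time: greedy span selection from the start/end logits. The real SQuAD script
# uses n-best decoding with length constraints; this is only a minimal example.
def _example_decode_span(start_logits, end_logits):
    start_index = int(torch.argmax(start_logits, dim=-1)[0])
    end_index = int(torch.argmax(end_logits, dim=-1)[0])
    return start_index, end_index  # token positions of the predicted answer span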
import torch
import math
#######################################################################################################################################################################
def unpad_input(out_, in_, indices):
out_[:,:] = in_[indices[:],:]
def pad_input(out_, in_, indices):
out_[indices[:],:] = in_[:,:]
def unpad_mask(out_, in_, indices):
out_[:] = in_.flatten()[indices[:]]
#######################################################################################################################################################################
def generate_mask(attention_mask, heads, pad=False, fuse_mask=True):
seqlen = attention_mask.sum(dim=1).float().cpu()
if pad == False:
seqlen[:] = ((seqlen[:] + 16 - 1) / 16).floor()*16
seqlen[seqlen < 16] = 16
seqlen = seqlen.int()
ntokens = seqlen.sum().item()
else:
batch = attention_mask.shape[0]
maxseqlen = attention_mask.shape[1]
seqlen.fill_(maxseqlen)
seqlen = seqlen.int()
ntokens = batch * maxseqlen
padded_mask = attention_mask.clone()
for i in range(len(seqlen)):
padded_mask[i,:seqlen[i]] = 1
indices = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten()
if pad==False and fuse_mask == True:
mask = torch.zeros([ntokens], device="cuda", dtype=torch.float16)
unpad_mask(mask, attention_mask, indices)
mask = (1 - mask) * -10000.0
elif pad==False and fuse_mask == False:
padded_mask = (padded_mask.unsqueeze(1) * padded_mask.unsqueeze(2)).unsqueeze(1).half().repeat(1, heads, 1, 1)
indices_mask = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten()
mask = torch.zeros([len(indices_mask)], device="cuda", dtype=torch.float16)
unpad_mask(mask, padded_mask, indices_mask)
mask = (1 - mask) * -10000.0
elif pad==True and fuse_mask == True:
mask = -10000.0 * (1 - attention_mask).half().view(-1)
elif pad==True and fuse_mask == False:
mask = -10000.0 * (1 - (attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2))).unsqueeze(1).half().repeat(1, heads, 1, 1).view(-1)
return indices, mask, seqlen, ntokens
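# Illustrative usage sketch (not part of the original file): generate_mask is called once
# per batch on the [batch, seqlen] attention mask. With pad=False it returns the flat
# indices of the kept token positions, an additive float16 mask (0 for real tokens,
# -10000 for the extra positions introduced by rounding each length up to a multiple of 16),
# the rounded per-sample lengths, and the total token count, e.g.
#
#   indices, mask, seqlen, ntokens = generate_mask(attention_mask, heads=16)
#
# The value of `heads` above is a placeholder; it is only used when fuse_mask=False.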
#######################################################################################################################################################################
class PadInput(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens):
ctx.save_for_backward(indices)
ctx.hidden = hidden
ctx.ntokens = ntokens
ntokens = batch*maxseqlen
output = torch.zeros([ntokens,hidden], device="cuda", dtype=torch.float16)
pad_input(output, input, indices)
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
indices, = ctx.saved_tensors
grad_input = torch.zeros([ctx.ntokens,ctx.hidden], device="cuda", dtype=torch.float16)
unpad_input(grad_input, grad_output, indices)
return grad_input[:ctx.ntokens], None, None, None, None, None
#######################################################################################################################################################################
class UnpadInput(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens):
ctx.save_for_backward(indices)
ctx.hidden = hidden
ctx.ntokens = batch*maxseqlen
output = torch.zeros([ntokens, hidden], device="cuda", dtype=torch.float16)
unpad_input(output, input, indices)
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
indices, = ctx.saved_tensors
grad_input = torch.zeros([ctx.ntokens,ctx.hidden], device="cuda", dtype=torch.float16)
pad_input(grad_input, grad_output, indices)
return grad_input[:ctx.ntokens], None, None, None, None, None
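# Illustrative usage sketch (not part of the original file): UnpadInput/PadInput are the
# autograd-aware wrappers around the gather/scatter helpers above. A typical round trip
# on a [batch*maxseqlen, hidden] float16 tensor looks roughly like
#
#   unpadded = UnpadInput.apply(hidden_states, indices, batch, maxseqlen, hidden, ntokens)
#   ...                                      # run the encoder on the unpadded tokens
#   padded = PadInput.apply(unpadded, indices, batch, maxseqlen, hidden, ntokens)
#
# where `indices` and `ntokens` come from generate_mask; `hidden_states` is a placeholder name.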
#######################################################################################################################################################################
import torch
import mhalib
###########################################################################################
class FastSoftmaxFunction(torch.autograd.Function):
@staticmethod
def forward(cxt, input, dim, batch, seqlen, heads, stream, sync, timers):
if timers: timers['start_fprop'].record()
mhalib.FastSoftmaxFprop(input, batch, seqlen, heads, stream, sync)
if timers: timers['stop_fprop'].record()
cxt.save_for_backward(input,seqlen)
cxt.dim = dim
cxt.batch = batch
cxt.heads = heads
cxt.stream = stream
cxt.sync = sync
cxt.timers = timers
return input
@staticmethod
def backward(cxt, grad_output):
output, seqlen, = cxt.saved_tensors
dim = cxt.dim
batch = cxt.batch
heads = cxt.heads
if cxt.timers: cxt.timers['start_dgrad'].record()
mhalib.FastSoftmaxBprop(output, grad_output, batch, seqlen, heads, cxt.stream, cxt.sync)
if cxt.timers: cxt.timers['stop_dgrad'].record()
return grad_output, None, None, None, None, None, None, None
class FastSoftmax(torch.nn.Module):
def __init__(self, dim=None, stream=True, sync=True, timer=False):
super(FastSoftmax, self).__init__()
self.dim = dim
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, input, batch, seqlen, heads):
return FastSoftmaxFunction.apply(input, self.dim, batch, seqlen, heads, self.stream, self.sync, self.timers)
###########################################################################################
class FastMaskSoftmaxFunction(torch.autograd.Function):
@staticmethod
def forward(cxt, input, mask, dim, batch, seqlen, heads, stream, sync, timers):
if timers: timers['start_fprop'].record()
mhalib.FastMaskSoftmaxFprop(input, mask, batch, seqlen, heads, stream, sync)
if timers: timers['stop_fprop'].record()
cxt.save_for_backward(input,seqlen)
cxt.dim = dim
cxt.batch = batch
cxt.heads = heads
cxt.stream = stream
cxt.sync = sync
cxt.timers = timers
return input
@staticmethod
def backward(cxt, grad_output):
output, seqlen, = cxt.saved_tensors
dim = cxt.dim
batch = cxt.batch
heads = cxt.heads
if cxt.timers: cxt.timers['start_dgrad'].record()
mhalib.FastSoftmaxBprop(output, grad_output, batch, seqlen, heads, cxt.stream, cxt.sync)
if cxt.timers: cxt.timers['stop_dgrad'].record()
return grad_output, None, None, None, None, None, None, None, None, None, None, None
class FastMaskSoftmax(torch.nn.Module):
def __init__(self, dim=None, stream=True, sync=True, timer=False):
super(FastMaskSoftmax, self).__init__()
self.dim = dim
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, input, mask, batch, seqlen, heads):
return FastMaskSoftmaxFunction.apply(input, mask, self.dim, batch, seqlen, heads, self.stream, self.sync, self.timers)
###########################################################################################
class FastMaskSoftmaxDropoutFunction(torch.autograd.Function):
@staticmethod
def forward(cxt, input, mask, dim, batch, seqlen, heads, dropout_prob, stream, sync, timers, is_training):
if timers: timers['start_fprop'].record()
output, dropout_mask, = mhalib.FastMaskSoftmaxDropoutFprop(input, mask, batch, seqlen, heads, dropout_prob, stream, sync, is_training)
if timers: timers['stop_fprop'].record()
cxt.save_for_backward(input,dropout_mask,seqlen)
cxt.dim = dim
cxt.batch = batch
cxt.heads = heads
cxt.dropout_prob = dropout_prob
cxt.stream = stream
cxt.sync = sync
cxt.timers = timers
return output
@staticmethod
def backward(cxt, grad_output):
output, dropout_mask, seqlen, = cxt.saved_tensors
dim = cxt.dim
batch = cxt.batch
heads = cxt.heads
dropout_prob = cxt.dropout_prob
if cxt.timers: cxt.timers['start_dgrad'].record()
mhalib.FastMaskSoftmaxDropoutBprop(output, grad_output, dropout_mask, batch, seqlen, heads, dropout_prob, cxt.stream, cxt.sync)
if cxt.timers: cxt.timers['stop_dgrad'].record()
return grad_output, None, None, None, None, None, None, None, None, None, None, None, None, None
class FastMaskSoftmaxDropout(torch.nn.Module):
def __init__(self, dim=None, dropout_prob=None, stream=True, sync=True, timer=False):
super(FastMaskSoftmaxDropout, self).__init__()
self.dim = dim
self.dropout_prob = dropout_prob
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, input, mask, batch, seqlen, heads, is_training):
return FastMaskSoftmaxDropoutFunction.apply(input, mask, self.dim, batch, seqlen, heads, self.dropout_prob, self.stream, self.sync, self.timers, is_training)
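# Illustrative usage sketch (not part of the original file): the fused modules above take
# the unpadded attention scores together with the additive mask and per-sample lengths
# produced by generate_mask. Assuming such tensors exist, usage looks roughly like
#
#   softmax_dropout = FastMaskSoftmaxDropout(dim=-1, dropout_prob=0.1)
#   probs = softmax_dropout(scores, mask, batch, seqlen, heads, is_training=True)
#
# The exact tensor layout expected by the kernels is defined by the mhalib extension.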
###########################################################################################
python3 convert_tf_checkpoint.py \
--bert_model "bert-large-uncased" \
--tf_checkpoint /public/home/hepj/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
--bert_config_path /public/home/hepj/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--output_checkpoint /public/home/hepj/model_source/model_pytorch.ckpt.pt
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist
from contextlib import contextmanager
import logging.config
import random
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(seeds, device):
"""
Broadcasts random seeds to all distributed workers.
Returns list of random seeds (broadcasted from workers with rank 0).
:param seeds: list of seeds (integers)
:param device: torch.device
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
seeds_tensor = torch.LongTensor(seeds).to(device)
torch.distributed.broadcast(seeds_tensor, 0)
seeds = seeds_tensor.tolist()
return seeds
def setup_seeds(master_seed, epochs, device):
"""
Generates seeds from one master_seed.
Function returns (worker_seeds, shuffling_seeds), worker_seeds are later
used to initialize per-worker random number generators (mostly for
dropouts), shuffling_seeds are for RNGs responsible for reshuffling the
dataset before each epoch.
Seeds are generated on worker with rank 0 and broadcasted to all other
workers.
:param master_seed: master RNG seed used to initialize other generators
:param epochs: number of epochs
:param device: torch.device (used for distributed.broadcast)
"""
if master_seed is None:
# random master seed, random.SystemRandom() uses /dev/urandom on Unix
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if get_rank() == 0:
# The master seed is reported only from the rank=0 worker to avoid confusion;
# the seeds generated on rank=0 are later broadcast to the other workers.
logging.info(f'Using random master seed: {master_seed}')
else:
# master seed was specified from command line
logging.info(f'Using master seed from command line: {master_seed}')
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, get_world_size())
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
# broadcast seeds from rank=0 to other workers
worker_seeds = broadcast_seeds(worker_seeds, device)
shuffling_seeds = broadcast_seeds(shuffling_seeds, device)
return worker_seeds, shuffling_seeds
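# Illustrative usage sketch (not part of the original utilities): a training script would
# typically call setup_seeds once after the process group is initialized, e.g.
#
#   device = set_device(cuda=True, local_rank=args.local_rank)
#   worker_seeds, shuffling_seeds = setup_seeds(args.seed, args.epochs, device)
#   torch.manual_seed(worker_seeds[get_rank()])
#
# `args` is a placeholder for the caller's argument object.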
def barrier():
"""
Works as a temporary distributed barrier; PyTorch does not currently
implement a barrier for the NCCL backend.
Calls all_reduce on a dummy tensor and synchronizes with the GPU.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
torch.cuda.synchronize()
def get_rank():
"""
Gets distributed rank or returns zero if distributed is not initialized.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
else:
rank = 0
return rank
def get_world_size():
"""
Gets total number of distributed workers or returns one if distributed is
not initialized.
"""
if torch.distributed.is_available():
print("Torch distributed is available.")
else:
print("Torch distributed is not available.")
if torch.distributed.is_initialized():
print("Torch distributed is initialized.")
else:
print("Torch distributed is not initialized.")
if torch.distributed.is_available() and torch.distributed.is_initialized():
world_size = torch.distributed.get_world_size()
else:
world_size = 1
return world_size
def set_device(cuda, local_rank):
"""
Sets device based on local_rank and returns instance of torch.device.
:param cuda: if True: use cuda
:param local_rank: local rank of the worker
"""
if cuda:
torch.cuda.set_device(local_rank)
device = torch.device('cuda')
else:
device = torch.device('cpu')
return device
@contextmanager
def sync_workers():
"""
Yields distributed rank and synchronizes all workers on exit.
"""
rank = get_rank()
yield rank
barrier()
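# Illustrative usage sketch (not part of the original utilities): sync_workers lets a single
# rank perform an action while keeping all ranks synchronized afterwards, e.g.
#
#   with sync_workers() as rank:
#       if rank == 0:
#           torch.save(model.state_dict(), "checkpoint.pt")  # placeholder path
#
# Every rank then waits at the barrier when the context manager exits.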
def is_main_process():
return get_rank() == 0
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
import six
from io import open
from file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
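# Illustrative usage sketch (not part of the original module): given a local vocabulary file
# (the path below is a placeholder), the tokenizer is used roughly as follows:
#
#   tokenizer = BertTokenizer("vocab.txt", do_lower_case=True)
#   tokens = tokenizer.tokenize("Who discovered penicillin?")
#   ids = tokenizer.convert_tokens_to_ids(tokens)
#
# BertTokenizer.from_pretrained("bert-large-uncased") instead downloads and caches the
# matching vocabulary file.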
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because the English Wikipedia does contain
# some Chinese words).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Deploying the BERT model using Triton Inference Server
## Solution overview
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
This folder contains detailed performance analysis as well as scripts to run SQuAD fine-tuning on BERT model using Triton Inference Server.
## Setup
The first step is to train BERT for question answering. The process is the same as in the main readme.
1. Download the SQuAD dataset with `cd [bert folder]/data/squad/ && bash ./squad_download.sh`.
2. Build the Docker container with `bash ./scripts/docker/build.sh`.
3. [train](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT#training-process) your own checkpoint and fine-tune it, or [download](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1/files) the already trained and fine-tuned checkpoint from the [NGC](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1/files) model repository.
The checkpoint should be placed in `[bert folder]/checkpoints/<checkpoint>`. By default, the scripts assume `<checkpoint>` is `bert_qa.pt`; therefore, you might have to rename the trained or downloaded model accordingly.
Note: The following instructions are run from outside the container and call `docker run` commands as required. \
Unless stated otherwise, all the commands below have to be executed from `[bert folder]`.
## Quick Start Guide
### Deploying the model
The following command exports the checkpoint to `torchscript`, and deploys the Triton model repository.
`bash ./triton/export_model.sh`
The deployed Triton model repository will be in `[bert folder]/results/triton_models`.
Edit `[bert folder]/triton/export_model.sh` to deploy BERT in ONNX format.
Change the value of `EXPORT_FORMAT` from `ts-script` to `onnx`, and change the value of `triton_model_name` from `bertQA-ts` to `bertQA-onnx` accordingly.
Moreover, you may set `precision` to either `fp32` or `fp16`.
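For example, to deploy in ONNX format at half precision, the relevant assignments in the script would be set to `EXPORT_FORMAT="onnx"`, `triton_model_name="bertQA-onnx"` and `precision="fp16"` (variable names as referenced above; the exact assignment syntax in the script may differ).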
### Running the Triton server
To launch the Triton server, execute the following command.
`docker run --rm --gpus device=0 --ipc=host --network=host -p 8000:8000 -p 8001:8001 -p 8002:8002 -v $PWD/results/triton_models:/models nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1`
Here `--gpus device=0` exposes only GPU 0 to the server. Writing `device=0,1,2,3` would select GPUs 0, 1, 2 and 3, while `device=all` exposes all available GPUs to the server.
By default, the server expects the model repository to be in `[bert folder]/results/triton_models`.
### Running the custom Triton client
The custom Triton client is found in `[bert folder]/triton/client.py`.
It may be used once BERT is deployed and the Triton server is running. To try it, do the following steps.
1. Start the BERT docker container with the following command: \
`docker run -it --rm --ipc=host --network=host -v $PWD/vocab:/workspace/bert/vocab bert:latest` \
Note that the client does not require GPU support.
2. Move to the triton folder with the following command: \
`cd /workspace/bert/triton/`
3. Run the client with the following command: \
`python client.py --do_lower_case --version_2_with_negative --vocab_file=../vocab/vocab --triton-model-name=bertQA-ts-script`
This will send a request to the already running Triton server, which will process it, and return the result to the client. The response will be printed on the screen.
You may send your own question-context pair for processing using the `--question` and `--context` flags of `client.py`; see the example below.
You may want to use the `--triton-model-name` flag to select the model in onnx format.
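For example, an illustrative call with placeholder question and context strings looks like: \
`python client.py --do_lower_case --version_2_with_negative --vocab_file=../vocab/vocab --triton-model-name=bertQA-ts-script --question="Who discovered penicillin?" --context="Alexander Fleming discovered penicillin in 1928."`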
### Evaluating the deployed model on SQuAD 1.1
To deploy and evaluate your model, run the following command.
`bash ./triton/evaluate.sh`
By default, this will deploy BERT in `torchscript` format and evaluate it on SQuAD 1.1.
You may change the format of deployment by editing `[bert folder]/triton/evaluate.sh`.
Change the value of `EXPORT_FORMAT` from `ts-script` to `onnx`. Moreover, you may set `precision` to either `fp32` or `fp16`.
### Generating performance data
To collect performance data, run the following command.
`bash ./triton/generate_figures.sh`
By default, this will deploy BERT in `torchscript` format, launch the server, run the perf client, collect statistics and place them in `[bert folder]/results/triton_models/perf_client`.
You may change the format of deployment by editing `./triton/generate_figures.sh`: change the value of `EXPORT_FORMAT` from `ts-script` to `onnx`.
Moreover, you may set `precision` to either `fp32` or `fp16`.
## Advanced
### Other scripts
To launch the Triton server in a detached state, run the following command.
`bash ./triton/launch_triton_server.sh`
By default, the Triton server is expecting the model repository in `[bert folder]/results/triton_models`.
To make the machine wait until the server is initialized, and the model is ready for inference, run the following command.
`bash ./triton/wait_for_triton_server.sh`
## Performance
The numbers below are averages, measured with Triton on a V100 32GB GPU, with [static batching](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/model_configuration.html#scheduling-and-batching).
| Format | GPUs | Batch size | Sequence length | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (mixed precision/FP32) |
|--------|------|------------|-----------------|----------------------------------|---------------------------------------------|--------------------------------------------|
|pytorch | 1 | 1 | 384 | 30.1 | 28.0 | 0.93x |
|pytorch | 1 | 8 | 384 | 36.0 | 116.8 | 3.24x |
|torchscript | 1 | 1 | 384 | 32.20 | 38.40 | 1.19x |
|torchscript | 1 | 8 | 384 | 40.00 | 134.40 | 3.36x |
|onnx | 1 | 1 | 384 | 33.30 | 92.00 | 2.76x |
|onnx | 1 | 8 | 384 | 42.60 | 165.30 | 3.88x |