Commit e5ca7e62 authored by hepj987

Initialize repository
#!/bin/bash
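# Launcher for distributed SQuAD fine-tuning (run_squad_v4.py) under Open MPI on a 4-GPU
# ROCm node: the case block below pins each local rank to its own GPU (HIP_VISIBLE_DEVICES),
# InfiniBand HCA (UCX_NET_DEVICES), and NUMA node (numactl). FP16/AMP are commented out here.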
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 run_squad_v4.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/torch/SQuAD4 \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary /public/home/hepj/outdir/torch/SQuAD4/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank ${comm_rank} \
--world_size 4 \
--use_env \
--dist_url tcp://localhost:34567 \
--eval_script ./evaluate-v1.1.py
"
# --fp16 \
# --amp \
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
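# Same launcher as the script above, but with mixed precision enabled (--fp16 --amp).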
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 run_squad_v4.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/torch/SQuAD4 \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary /public/home/hepj/outdir/torch/SQuAD4/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank ${comm_rank} \
--world_size 4 \
--use_env \
--fp16 \
--amp \
--dist_url tcp://localhost:34567 \
--eval_script ./evaluate-v1.1.py
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
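# Non-distributed variant: runs run_squad_v1.py with --local_rank -1 (no world_size/dist_url),
# FP16/AMP enabled; each MPI local rank is still pinned to its own GPU, HCA, and NUMA node.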
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 run_squad_v1.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/tourch/SQuAD \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary ./log/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--do_eval \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank -1 \
--fp16 \
--amp \
--eval_script ./evaluate-v1.1.py
"
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
import torch
import mhalib
###########################################################################################
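# Bmm1: first batched matmul of attention (Q x K^T) over variable-length, unpadded sequences.
# Inputs are flattened token-major tensors, `seqlen` holds per-sequence lengths, and the work
# is dispatched to the mhalib CUDA extension (FastBmm1Fprop / Dgrad1 / Dgrad2).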
class Bmm1Function(torch.autograd.Function):
@staticmethod
def forward(ctx, batch1, batch2, seqlen, batch, maxseqlen, heads, embed, scale, stream, sync):
ctx.save_for_backward(batch1, batch2, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.scale = scale
ctx.sync = sync
ctx.stream = stream
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
output = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16)
mhalib.FastBmm1Fprop(batch2.flatten().contiguous(), batch1.flatten().contiguous(), output.flatten().contiguous(), batch, seqlen, heads, embed, scale, False, stream, sync)
return output[:ntokens2*heads]
@staticmethod
def backward(ctx, grad_output):
batch1, batch2, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
grad_batch1 = torch.empty(ntokens,heads*embed, device="cuda", dtype=torch.float16)
grad_batch2 = torch.empty(ntokens,heads*embed, device="cuda", dtype=torch.float16)
mhalib.FastBmm1Dgrad2(batch2.flatten().contiguous(), grad_output.flatten().contiguous(), grad_batch1.flatten().contiguous(), batch, seqlen, heads, embed, ctx.scale, False, ctx.stream, ctx.sync)
mhalib.FastBmm1Dgrad1(batch1.flatten().contiguous(), grad_output.flatten().contiguous(), grad_batch2.flatten().contiguous(), batch, seqlen, heads, embed, ctx.scale, False, ctx.stream, ctx.sync)
return grad_batch1[:ntokens], grad_batch2[:ntokens], None, None, None, None, None, None, None, None
class Bmm1(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, scale=False, stream=True, sync=True):
super(Bmm1, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.scale = scale
self.sync = sync
self.stream = stream
def forward(self, batch1, batch2, batch, seqlen):
return Bmm1Function.apply(batch1, batch2, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.scale, self.stream, self.sync)
##########################################################################################
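# Bmm1Strided: same Q x K^T, but Q and K are read directly from a packed QKV buffer ("mixed")
# with row stride 3*embed. The forward pass also returns `mixed` so the strided Bmm2 can reuse
# it, and optional CUDA events time the fprop/dgrad/wgrad kernels.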
class Bmm1StridedFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, mixed, seqlen, batch, maxseqlen, heads, embed, scale, stream, sync, timers):
ctx.save_for_backward(mixed, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.scale = scale
ctx.sync = sync
ctx.stream = stream
ctx.timers = timers
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
output = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16)
if timers: timers['start_fprop'].record()
mhalib.FastBmm1Fprop(mixed, mixed, output, batch, seqlen, heads, embed, scale, True, stream, sync)
if timers: timers['stop_fprop'].record()
return output[:ntokens2*heads], mixed
@staticmethod
#def backward(ctx, grad_output):
def backward(ctx, grad_output, grad_mixed):
mixed, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
#grad_mixed = torch.empty([ntokens,heads*3*embed], device="cuda", dtype=torch.float16)
if ctx.timers: ctx.timers['start_dgrad'].record()
mhalib.FastBmm1Dgrad2(mixed, grad_output, grad_mixed, batch, seqlen, heads, embed, ctx.scale, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_dgrad'].record()
if ctx.timers: ctx.timers['start_wgrad'].record()
mhalib.FastBmm1Dgrad1(mixed, grad_output, grad_mixed, batch, seqlen, heads, embed, ctx.scale, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_wgrad'].record()
#return grad_mixed[:ntokens], None, None, None, None, None, None, None, None, None
        return grad_mixed[:ntokens], None, None, None, None, None, None, None, None, None
class Bmm1Strided(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, scale=True, stream=True, sync=True, timer=False):
super(Bmm1Strided, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.scale = scale
self.sync = sync
self.stream = stream
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'start_wgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True),
'stop_wgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, mixed, batch, seqlen):
return Bmm1StridedFunction.apply(mixed, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.scale, self.stream, self.sync, self.timers)
###########################################################################################
import torch
import mhalib
###########################################################################################
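# Bmm2: second batched matmul of attention (probs x V) over variable-length, unpadded
# sequences, again dispatched to the mhalib extension.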
class Bmm2Function(torch.autograd.Function):
@staticmethod
def forward(ctx, batch1, batch2, seqlen, batch, maxseqlen, heads, embed, sync, stream):
ctx.save_for_backward(batch1, batch2, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.stream = stream
ctx.sync = sync
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
output = torch.empty([ntokens,heads,embed], device="cuda", dtype=torch.float16)
mhalib.FastBmm2Fprop(batch2.flatten().contiguous(), batch1.flatten().contiguous(), output, batch, seqlen, heads, embed, False, False, stream, sync)
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
batch1, batch2, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
grad_batch1 = torch.empty([ntokens2*heads], device="cuda", dtype=torch.float16)
grad_batch2 = torch.empty([ntokens,heads*embed], device="cuda", dtype=torch.float16)
mhalib.FastBmm2Dgrad1(batch2.flatten().contiguous(), grad_output, grad_batch1, batch, seqlen, heads, embed, False, False, ctx.stream, ctx.sync)
mhalib.FastBmm2Dgrad2(grad_output, batch1, grad_batch2, batch, seqlen, heads, embed, False, False, ctx.stream, ctx.sync)
return grad_batch1[:ntokens2*heads], grad_batch2[:ntokens], None, None, None, None, None, None, None
class Bmm2(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, stream=True, sync=True):
super(Bmm2, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.stream = stream
self.sync = sync
def forward(self, batch1, batch2, batch, seqlen):
return Bmm2Function.apply(batch1, batch2, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.stream, self.sync)
###########################################################################################
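# Bmm2Strided: probs x V with V read from the packed QKV buffer ("mixed"); the backward pass
# writes the V gradient into a full [ntokens, 3*heads*embed] grad buffer.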
class Bmm2StridedFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, batch1, mixed, seqlen, batch, maxseqlen, heads, embed, stream, sync, timers):
ctx.save_for_backward(batch1, mixed, seqlen)
ctx.batch = batch
ctx.maxseqlen = maxseqlen
ctx.heads = heads
ctx.embed = embed
ctx.stream = stream
ctx.sync = sync
ctx.timers = timers
ntokens = seqlen.sum().item()
ctx.ntokens = ntokens
output = torch.empty([ntokens,heads,embed], device="cuda", dtype=torch.float16)
if timers: timers['start_fprop'].record()
mhalib.FastBmm2Fprop(mixed, batch1, output, batch, seqlen, heads, embed, False, True, stream, sync)
if timers: timers['stop_fprop'].record()
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
batch1, mixed, seqlen = ctx.saved_tensors
batch = ctx.batch
maxseqlen = ctx.maxseqlen
heads = ctx.heads
embed = ctx.embed
ntokens = ctx.ntokens
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]
grad_batch1 = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16)
grad_mixed = torch.empty([ntokens,heads*3*embed], device="cuda", dtype=torch.float16)
if ctx.timers: ctx.timers['start_dgrad'].record()
mhalib.FastBmm2Dgrad1(mixed, grad_output, grad_batch1, batch, seqlen, heads, embed, False, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_dgrad'].record()
if ctx.timers: ctx.timers['start_wgrad'].record()
mhalib.FastBmm2Dgrad2(grad_output, batch1, grad_mixed, batch, seqlen, heads, embed, False, True, ctx.stream, ctx.sync)
if ctx.timers: ctx.timers['stop_wgrad'].record()
return grad_batch1[:ntokens2*heads], grad_mixed[:ntokens], None, None, None, None, None, None, None, None
class Bmm2Strided(torch.nn.Module):
def __init__(self, batch, seqlen, heads, embed, stream=True, sync=True, timer=False):
super(Bmm2Strided, self).__init__()
self.heads = heads
self.embed = embed
self.maxseqlen = seqlen
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'start_wgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True),
'stop_wgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, batch1, mixed, batch, seqlen):
return Bmm2StridedFunction.apply(batch1, mixed, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.stream, self.sync, self.timers)
###########################################################################################
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
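# Converts a TensorFlow BERT pretraining checkpoint into a PyTorch checkpoint
# (a dict holding the model state_dict), presumably the file later passed to the
# SQuAD run scripts via --init_checkpoint.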
import torch
import argparse
from modeling import BertForPreTraining, BertConfig
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument('--tf_checkpoint',
type=str,
default="/google_bert_data",
help="Path to directory containing TF checkpoint")
parser.add_argument('--bert_config_path',
type=str,
default="/workspace/phase1",
help="Path bert_config.json is located in")
parser.add_argument('--output_checkpoint', type=str,
default='./checkpoint.pt',
help="Path to output PyT checkpoint")
return parser.parse_args()
def prepare_model(args, device):
# Prepare model
config = BertConfig.from_json_file(args.bert_config_path)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
print('padded vocab size to: {}'.format(config.vocab_size))
# Set some options that the config file is expected to have (but don't need to be set properly
# at this point)
config.pad = False
config.unpad = False
config.dense_seq_output = False
config.fused_mha = False
config.fused_gelu_bias = False
config.fuse_qkv = False
config.fuse_scale = False
config.fuse_mask = False
config.fuse_dropout = False
config.apex_softmax = False
config.enable_stream = False
if config.fuse_mask == True: config.apex_softmax = True
if config.pad == False: config.enable_stream = True
if config.unpad == True: config.fused_mha = False
#Load from TF checkpoint
model = BertForPreTraining.from_pretrained(args.tf_checkpoint, from_tf=True, config=config)
return model
def main():
args = parse_arguments()
device = torch.device("cuda")
model = prepare_model(args, device)
torch.save({'model' : model.state_dict() }, args.output_checkpoint)
if __name__ == "__main__":
main()
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None, from_tf=False):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
# if not os.path.exists(url_or_filename):
# raise ValueError("Local cached file does not exist: {}".format(parsed))
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif from_tf and os.path.exists(url_or_filename + ".meta"):
# TF checkpoint exists
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise ValueError("local cached file {} doesn't exist".format(cache_path))
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
import torch
from .fused_gelu import bias_gelu_impl
__all__ = ["bias_gelu_impl"]
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@torch.jit.script
def bias_gelu(bias, y):
x = bias + y
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
x = bias + y
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff*g
class GeLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
def forward(ctx, input, bias):
ctx.save_for_backward(input, bias)
return bias_gelu(bias, input)
@staticmethod
def backward(ctx, grad_output):
input, bias = ctx.saved_tensors
tmp = bias_gelu_back(grad_output, bias, input)
return tmp, tmp
bias_gelu_impl = GeLUFunction.apply
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func
from bmm1 import *
from bmm2 import *
from padding import *
from softmax import *
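# FastUnpadBertSelfAttention: BERT self-attention over unpadded, variable-length batches.
# Depending on the config flags it fuses the QKV projection into a single addmm, uses the
# strided Bmm1/Bmm2 kernels, and selects between fused mask+softmax+dropout, fused
# mask+softmax, apex softmax, or plain PyTorch softmax.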
class FastUnpadBertSelfAttention(nn.Module):
def __init__(self, config, enable_stream=True, enable_sync=True, fuse_mask=True, fuse_scale=True, fuse_qkv=True, fuse_dropout=True, apex_softmax=True, pad=True):
super(FastUnpadBertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.hidden_size = config.hidden_size
self.fuse_qkv = fuse_qkv
self.fuse_scale = fuse_scale
self.fuse_mask = fuse_mask
self.fuse_dropout = fuse_dropout
self.apex_softmax = apex_softmax
self.pad = pad
self.enable_stream = enable_stream
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
if self.fuse_qkv:
self.bmm1 = Bmm1Strided(None,None,self.num_attention_heads,self.attention_head_size, scale=self.fuse_scale, stream=enable_stream, sync=enable_sync, timer=False)
self.bmm2 = Bmm2Strided(None,None,self.num_attention_heads,self.attention_head_size, stream=enable_stream, sync=enable_sync, timer=False)
else:
self.bmm1 = Bmm1(None,None,self.num_attention_heads,self.attention_head_size, scale=self.fuse_scale, stream=enable_stream, sync=enable_sync)
self.bmm2 = Bmm2(None,None,self.num_attention_heads,self.attention_head_size, stream=enable_stream, sync=enable_sync)
if self.fuse_dropout == False:
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
if self.fuse_mask == True and self.fuse_dropout == True:
self.softmax = FastMaskSoftmaxDropout(dim=-1, dropout_prob=config.attention_probs_dropout_prob,stream=enable_stream, sync=(not self.pad), timer=False)
elif self.fuse_mask == True:
self.softmax = FastMaskSoftmax(dim=-1, stream=enable_stream, sync=enable_sync, timer=False)
else:
self.softmax = FastSoftmax(dim=-1, stream=enable_stream, sync=enable_sync, timer=False)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = torch.reshape(x, new_x_shape)
return x.permute(0, 2, 1, 3)
def transpose_key_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = torch.reshape(x, new_x_shape)
return x.permute(0, 2, 3, 1)
def pytorch_softmax(self,attention_scores, batch, seqlen, heads):
ntokens2 = 0
for i in range(batch):
ntokens2 += seqlen[i]*seqlen[i]*self.num_attention_heads
attention_probs = torch.zeros(ntokens2, device="cuda", dtype=torch.float16)
ntokens2 = 0
for i in range(batch):
tokens2 = seqlen[i]*seqlen[i]*self.num_attention_heads
attention_probs[ntokens2:ntokens2+tokens2] = F.softmax(attention_scores[ntokens2:ntokens2+tokens2].view(1,self.num_attention_heads,seqlen[i],seqlen[i]), dim=-1).flatten().contiguous()
ntokens2 += tokens2
return attention_probs
def forward(self, hidden_states, attention_mask, seqlen, batch, is_training=True):
self.batch = batch
# QKV
if self.fuse_qkv:
weight = torch.cat([self.query.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size), self.key.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size), self.value.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size)], dim=1).reshape(self.all_head_size*3,self.hidden_size).contiguous()
bias = torch.cat([self.query.bias.view(self.num_attention_heads,1,self.attention_head_size), self.key.bias.view(self.num_attention_heads,1,self.attention_head_size), self.value.bias.view(self.num_attention_heads,1,self.attention_head_size)],dim=1).reshape(3*self.hidden_size).contiguous()
mixed_x_layer = torch.addmm(bias, hidden_states, weight.t())
else:
query_layer = self.query(hidden_states)
key_layer = self.key(hidden_states)
value_layer = self.value(hidden_states)
# BMM1.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_qkv:
attention_scores, qkv_layer = self.bmm1(mixed_x_layer, self.batch, seqlen)
else:
attention_scores = self.bmm1(query_layer, key_layer, self.batch, seqlen)
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_scale == False:
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Softmax.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_mask ==True and self.fuse_dropout == True:
attention_probs = self.softmax(attention_scores, attention_mask, self.batch, seqlen, self.num_attention_heads, is_training)
elif self.fuse_mask == True:
attention_probs = self.softmax(attention_scores, attention_mask, self.batch, seqlen, self.num_attention_heads)
else:
attention_scores = attention_scores + attention_mask.view(-1)
if self.apex_softmax == True:
attention_probs = self.softmax(attention_scores, self.batch, seqlen, self.num_attention_heads)
else:
if self.pad == True:
attention_probs = F.softmax(attention_scores.view(batch,self.num_attention_heads,seqlen[0],seqlen[0]), dim=-1).flatten().contiguous()
else:
attention_probs = self.pytorch_softmax(attention_scores, self.batch, seqlen, self.num_attention_heads)
# Dropout.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_dropout == False:
attention_probs = self.dropout(attention_probs)
# BMM2.
if self.enable_stream: torch.cuda.synchronize()
if self.fuse_qkv:
context_layer = self.bmm2(attention_probs, qkv_layer, self.batch, seqlen)
else:
context_layer = self.bmm2(attention_probs, value_layer, self.batch, seqlen)
if self.enable_stream: torch.cuda.synchronize()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = torch.reshape(context_layer, new_context_layer_shape)
return context_layer
#include <vector>
#include <iostream>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
//#include <cuda_profiler_api.h>
#include "THC/THC.h"
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <math.h>
#include "/opt/pytorch/apex/apex/contrib/csrc/multihead_attn/softmax.h"
#define nstreams 16
// global variables.
cudaStream_t stream[nstreams];
cublasHandle_t handle;
///////////////////////////////////////////////////////////////////////////////////////////////////
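// FastBmm1Fprop_: for each sequence, one cublasGemmStridedBatchedEx call computes Q*K^T for
// all heads; with enable_stream each sequence is issued on its own CUDA stream (round-robin
// over nstreams), and `strided` selects the packed-QKV layout (row stride 3*embed).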
void FastBmm1Fprop_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast<float>(embed));
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? embed : 0)); // key
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr())); // query
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr())); // output
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams]: at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seqlen[i],
seqlen[i],
embed,
static_cast<const void*>(scale ? &alpha : &one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm2Fprop_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0;
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? 2*embed : 0)); // value
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr())); // query*key
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr())); // output
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams]: at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(&one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
enable_stream ? heads*embed : batch*heads*embed,
embed,
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + seqlen[i]*heads*embed);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm1Dgrad1_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast<float>(embed));
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr())); // query
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr()) + (strided ? embed : 0)); // grad_key
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(scale ? &alpha : &one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
enable_stream ? heads : heads*batch,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm2Dgrad1_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0;
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? 2*embed : 0)); // value
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr()));
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seqlen[i],
seqlen[i],
embed,
static_cast<const void*>(&one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
enable_stream ? heads*embed : batch*heads*embed,
embed,
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + seqlen[i]*heads*embed);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm1Dgrad2_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast<float>(embed));
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()) + (strided ? embed : 0)); // key
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr())); // grad query
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(scale ? &alpha : &one),
ptrA,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastBmm2Dgrad2_(torch::Tensor &A,
torch::Tensor &B,
torch::Tensor &C,
int batch,
torch::Tensor &seq_len,
int heads,
int embed,
bool scale,
bool strided,
bool enable_stream,
bool sync)
{
float one = 1.0, zero = 0.0;
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrA = static_cast<void*>(static_cast<half*>(A.data_ptr()));
void *ptrB = static_cast<void*>(static_cast<half*>(B.data_ptr()));
void *ptrC = static_cast<void*>(static_cast<half*>(C.data_ptr()) + (strided ? 2*embed : 0)); // grad-value
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
cublasGemmStridedBatchedEx(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
embed,
seqlen[i],
seqlen[i],
static_cast<const void*>(&one),
ptrA,
CUDA_R_16F,
enable_stream ? heads*embed : batch*heads*embed,
embed,
ptrB,
CUDA_R_16F,
seqlen[i],
seqlen[i]*seqlen[i],
static_cast<const void*>(&zero),
ptrC,
CUDA_R_16F,
(enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed),
strided ? 3*embed : embed,
enable_stream ? heads : batch*heads,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
ptrA = static_cast<void*>(static_cast<half*>(ptrA) + seqlen[i]*heads*embed);
ptrB = static_cast<void*>(static_cast<half*>(ptrB) + heads*seqlen[i]*seqlen[i]);
ptrC = static_cast<void*>(static_cast<half*>(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed));
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastSoftmaxFprop_(torch::Tensor &input,
int batch,
torch::Tensor &seq_len,
int heads,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_softmax<half, half, float>(
reinterpret_cast<half*>(ptrIn),
reinterpret_cast<const half*>(ptrIn),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i]);
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastSoftmaxBprop_(torch::Tensor &input,
torch::Tensor &output,
int batch,
torch::Tensor &seq_len,
int heads,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrOut = static_cast<void*>(output.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_softmax_backward_stream<half, half, float>(
static_cast<half*>(ptrOut),
static_cast<half*>(ptrOut),
reinterpret_cast<half const*>(ptrIn),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrOut = static_cast<void*>(static_cast<half*>(ptrOut) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastMaskSoftmaxFprop_(torch::Tensor &input,
torch::Tensor &mask,
int batch,
torch::Tensor &seq_len,
int heads,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrMask = static_cast<void*>(mask.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_additive_masked_softmax_stream<half, half, float>(
reinterpret_cast<half*>(ptrIn),
reinterpret_cast<const half*>(ptrIn),
reinterpret_cast<const half*>(ptrMask),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? heads*seqlen[i] : heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrMask = static_cast<void*>(static_cast<half*>(ptrMask) + seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
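// FastMaskSoftmaxDropoutFprop_: additive-mask softmax applied in place per sequence, then
// (when training) at::_fused_dropout; returns the dropout results and the dropout mask.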
std::vector<torch::Tensor> FastMaskSoftmaxDropoutFprop_(torch::Tensor &input,
torch::Tensor &mask,
int batch,
torch::Tensor &seq_len,
int heads,
float dropout_prob,
bool enable_stream,
bool sync,
bool is_training)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrMask = static_cast<void*>(mask.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_additive_masked_softmax_stream<half, half, float>(
reinterpret_cast<half*>(ptrIn),
reinterpret_cast<const half*>(ptrIn),
reinterpret_cast<const half*>(ptrMask),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? heads*seqlen[i] : heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrMask = static_cast<void*>(static_cast<half*>(ptrMask) + seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
int ntokens = seqlen[0];
for(int i = 1; i < (enable_stream ? batch : 2); i++) {
ntokens += seqlen[i];
}
auto act_options = input.options().requires_grad(false);
auto mask_options = act_options.dtype(torch::kUInt8);
torch::Tensor dropout_results = torch::empty({batch*heads, ntokens}, act_options);
torch::Tensor dropout_mask = torch::empty({batch*heads, ntokens}, mask_options);
//torch::Tensor dropout_results = torch::empty({batch*heads, seqlen[0], seqlen[0]}, act_options);
//torch::Tensor dropout_mask = torch::empty({batch*heads, seqlen[0], seqlen[0]}, mask_options);
if (is_training) {
//use at:: function so that C++ version generates the same random mask as python version
auto dropout_tuple = at::_fused_dropout(input, 1.0f-dropout_prob);
dropout_results = std::get<0>(dropout_tuple);
dropout_mask = std::get<1>(dropout_tuple);
}
return {dropout_results, dropout_mask};
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void FastMaskSoftmaxDropoutBprop_(torch::Tensor &input,
torch::Tensor &output,
torch::Tensor &dropout_mask,
int batch,
torch::Tensor &seq_len,
int heads,
float dropout_prob,
bool enable_stream,
bool sync)
{
int *seqlen = static_cast<int*>(seq_len.data_ptr());
void *ptrIn = static_cast<void*>(input.data_ptr());
void *ptrOut = static_cast<void*>(output.data_ptr());
void *ptrDropoutMask = static_cast<void*>(dropout_mask.data_ptr());
for(int i = 0; i < (enable_stream ? batch : 1); i++) {
dispatch_masked_scale_softmax_backward_stream<half, half, float, false>(
static_cast<half*>(ptrOut),
static_cast<half*>(ptrOut),
reinterpret_cast<half const*>(ptrIn),
reinterpret_cast<uint8_t const*>(ptrDropoutMask),
1.0/(1.0-dropout_prob),
seqlen[i],
seqlen[i],
enable_stream ? heads*seqlen[i] : batch*heads*seqlen[i],
enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream());
ptrIn = static_cast<void*>(static_cast<half*>(ptrIn) + heads*seqlen[i]*seqlen[i]);
ptrOut = static_cast<void*>(static_cast<half*>(ptrOut) + heads*seqlen[i]*seqlen[i]);
}
for(int i = 0; i < (enable_stream ? nstreams : 0); i++) {
if(sync) cudaStreamSynchronize(stream[i]);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
void init_mha_cuda_extension()
{
// CUDA Stream.
for(int i = 0; i < nstreams; i++) {
cudaStreamCreate(&stream[i]);
}
// CuBlas Handle.
cublasCreate(&handle);
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("InitMHACUDAExtension", &init_mha_cuda_extension, "InitMHACUDAExtension");
m.def("FastBmm1Fprop", &FastBmm1Fprop_, "FastBmm1Fprop");
m.def("FastBmm1Dgrad1", &FastBmm1Dgrad1_, "FastBmm1Dgrad1");
m.def("FastBmm1Dgrad2", &FastBmm1Dgrad2_, "FastBmm1Dgrad2");
m.def("FastBmm2Fprop", &FastBmm2Fprop_, "FastBmm2Fprop");
m.def("FastBmm2Dgrad1", &FastBmm2Dgrad1_, "FastBmm2Dgrad1");
m.def("FastBmm2Dgrad2", &FastBmm2Dgrad2_, "FastBmm2Dgrad2");
m.def("FastSoftmaxFprop", &FastSoftmaxFprop_, "FastSoftmaxFprop");
m.def("FastSoftmaxBprop", &FastSoftmaxBprop_, "FastSoftmaxBprop");
m.def("FastMaskSoftmaxFprop", &FastMaskSoftmaxFprop_, "FastMaskSoftmaxFprop");
m.def("FastMaskSoftmaxDropoutFprop", &FastMaskSoftmaxDropoutFprop_, "FastMaskSoftmaxDropoutFprop");
m.def("FastMaskSoftmaxDropoutBprop", &FastMaskSoftmaxDropoutBprop_, "FastMaskSoftmaxDropoutBprop");
}
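# setup.py for the mhalib extension: compiles mha_funcs.cu as a PyTorch CUDAExtension
# (code generated for sm_70 and sm_80).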
import torch
import setuptools
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='mhalib',
ext_modules=[
CUDAExtension(
name='mhalib',
sources=['mha_funcs.cu'],
extra_compile_args={
'cxx': ['-O3',],
'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode arch=compute_70,code=sm_70','-gencode arch=compute_80,code=sm_80','-gencode arch=compute_80,code=compute_80']
}
)
],
cmdclass={
'build_ext': BuildExtension
})
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
from operator import mul
from functools import reduce
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils import checkpoint
from apex.contrib.multihead_attn import SelfMultiheadAttn
from file_utils import cached_path
from layers.fused_gelu import bias_gelu_impl as bias_gelu
from utils import get_rank
import mhalib
from mha import *
logger = logging.getLogger(__name__)
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
def remap_attn_names_tf(name):
if 'attention' in name:
ind = name.index("attention")
if 'self' in name and 'query' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'q_weight']
if 'self' in name and 'query' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'q_bias']
if 'self' in name and 'key' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'k_weight']
if 'self' in name and 'key' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'k_bias']
if 'self' in name and 'value' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'v_weight']
if 'self' in name and 'value' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'v_bias']
if 'output' in name and 'dense' in name and 'kernel' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'out_proj_weight']
if 'output' in name and 'dense' in name and 'bias' in name:
name = name[:(ind+1)] + ['multi_head_attention', 'out_proj_bias']
if 'output' in name and 'LayerNorm' in name:
name = name[:(ind+1)] + ['layer_norm'] + name[-1:]
return name
def load_tf_weights_in_bert(model, tf_checkpoint_path, use_fast_mha=False):
""" Load tf checkpoints in a pytorch model
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(tf_checkpoint_path)
if get_rank() == 0:
print("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
if get_rank() == 0:
print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
# MHA params need to be treated separately
if use_fast_mha:
mha_params = ['q_weight', 'q_bias', 'k_weight', 'k_bias', 'v_weight', 'v_bias', 'out_proj_weight', 'out_proj_bias']
else:
mha_params = []
for name, array in zip(names, arrays):
name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
if any(n in ["adam_v", "adam_m", "global_step", "LAMB", "LAMB_1", "beta1_power", "beta2_power"] for n in name):
if get_rank() == 0:
print("Skipping {}".format("/".join(name)))
continue
if use_fast_mha:
name = remap_attn_names_tf(name)
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] in mha_params:
pointer = getattr(pointer, l[0])
elif l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel' or (m_name in mha_params and 'bias' not in m_name):
array = np.ascontiguousarray(np.transpose(array))
try:
assert pointer.shape == array.shape
except AssertionError as e:
# If copying smaller into larger, assume padded and ok
if reduce(mul, pointer.shape) > reduce(mul, array.shape):
if get_rank() == 0:
print("Initialize padded PyTorch weight {}".format(name))
pointer.data.zero_()
def generate_slices():
slices = []
for i in range(array.ndim):
slices.append(slice(0, array.shape[i], 1))
return slices
pointer.data[generate_slices()] = torch.from_numpy(array)
else:
e.args += (pointer.shape, array.shape)
raise
else:
if get_rank() == 0:
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
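# Illustrative usage sketch (not called anywhere in this file): how the TF checkpoint
# loader above would typically be driven. The checkpoint prefix is hypothetical and
# TensorFlow must be installed for the call to succeed.
def _example_load_tf_weights(model, tf_ckpt_prefix="/path/to/model.ckpt-28252"):
    # `model` is an already constructed BertForPreTraining / BertForQuestionAnswering
    # whose config matches the checkpoint; weights are copied in place and returned.
    return load_tf_weights_in_bert(model, tf_ckpt_prefix, use_fast_mha=False)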
def swish(x):
return x * torch.sigmoid(x)
def fast_gelu(x):
pi = 3.1415926535897932
cdf = 0.5 * (1.0 + torch.tanh((math.sqrt(2 / pi) * (x + 0.044715 * torch.pow(x, 3)))))
return x*cdf
#torch.nn.functional.gelu(x) # Breaks ONNX export
#ACT2FN = {"gelu": torch.nn.functional.gelu, "bias_gelu": bias_gelu, "relu": torch.nn.functional.relu, "swish": swish}
ACT2FN = {"gelu": fast_gelu, "bias_gelu": bias_gelu, "relu": torch.nn.functional.relu, "swish": swish}
class LinearActivation(torch.nn.Linear):
r"""Fused Linear and activation Module.
"""
__constants__ = ['bias']
def __init__(self, in_features, out_features, act='gelu', bias=True):
super(LinearActivation, self).__init__(in_features, out_features, bias)
self.act_fn = nn.Identity() #
self.biased_act_fn = None #
if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)): # For TorchScript
if bias and not 'bias' in act: # compatibility
act = 'bias_' + act #
self.biased_act_fn = ACT2FN[act] #
else:
self.act_fn = ACT2FN[act]
else:
self.act_fn = act
def forward(self, input):
if self.bias is not None:
return self.biased_act_fn(self.bias, nn.functional.linear(input, self.weight, None))
else:
return self.act_fn(nn.functional.linear(input, self.weight, None))
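# Minimal sketch of LinearActivation dispatch (illustrative only). With a string
# activation and bias=True the forward pass routes through the fused bias_gelu
# kernel imported from layers.fused_gelu; whether that kernel accepts CPU float
# tensors is an assumption here -- during training it runs on the GPU.
def _example_linear_activation():
    layer = LinearActivation(1024, 4096, act='gelu', bias=True)
    x = torch.randn(8, 1024)
    return layer(x)  # shape [8, 4096]: GELU(x @ W^T + b) via the fused path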
class BertConfig(object):
"""Configuration class to store the configuration of a `BertModel`.
"""
def __init__(self,
vocab_size_or_config_json_file,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
"""Constructs BertConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
else:
raise ValueError("First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)")
@classmethod
def from_dict(cls, json_object):
"""Constructs a `BertConfig` from a Python dictionary of parameters."""
config = BertConfig(vocab_size_or_config_json_file=-1)
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
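# Illustrative construction of a BertConfig (the values are the BERT-base defaults);
# the round trip below shows that to_dict()/from_dict() preserve the settings.
def _example_bert_config():
    config = BertConfig(vocab_size_or_config_json_file=30522, hidden_size=768,
                        num_hidden_layers=12, num_attention_heads=12,
                        intermediate_size=3072)
    same_config = BertConfig.from_dict(config.to_dict())
    return same_config.to_json_string()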
try:
import apex
#apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
import apex.normalization
#apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
BertLayerNorm = apex.normalization.FusedLayerNorm
except ImportError:
print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class BertSelfAttention(nn.Module):
def __init__(self, config):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def transpose_key_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 3, 1)
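# Note: transpose_for_scores returns [batch, heads, seq_len, head_size] while
# transpose_key_for_scores returns [batch, heads, head_size, seq_len], so the
# matmul in forward() computes Q.K^T directly without an explicit transpose.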
def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_key_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer)
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask (precomputed for all layers in the BertModel forward() function)
attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(2)
# Normalize the attention scores to probabilities.
attention_probs = self.softmax(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer
class BertSelfOutput(nn.Module):
def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# This module uses Apex C++ multihead attention implementation with fusions.
class FastBertAttention(nn.Module):
def __init__(self, config):
super(FastBertAttention, self).__init__()
self.multi_head_attention = SelfMultiheadAttn(config.hidden_size, config.num_attention_heads, dropout = config.attention_probs_dropout_prob, bias=True, include_norm_add=False, impl='fast', separate_qkv_params=True, mask_additive=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.layer_norm = BertLayerNorm(config.hidden_size, eps=1e-12)
def forward(self, input_tensor, attention_mask):
residual=input_tensor
multi_head_attention_output,_ = self.multi_head_attention(query = input_tensor, key = input_tensor, value = input_tensor, key_padding_mask=attention_mask, need_weights=True,attn_mask = None, is_training = self.training)
attention_output = self.dropout(multi_head_attention_output)
attention_output = self.layer_norm(attention_output + residual)
return attention_output
class FastUnpadBertAttention(nn.Module):
def __init__(self, config):
super(FastUnpadBertAttention, self).__init__()
self.self = FastUnpadBertSelfAttention(config, enable_stream=config.enable_stream, enable_sync=False, fuse_mask=config.fuse_mask, fuse_scale=config.fuse_scale, fuse_qkv=config.fuse_qkv, fuse_dropout=config.fuse_dropout, apex_softmax=config.apex_softmax, pad=config.pad)
self.output = BertSelfOutput(config)
def forward(self, input_tensor, attention_mask, seqlen, batch):
self_output = self.self(input_tensor, attention_mask, seqlen, batch, is_training = self.training)
attention_output = self.output(self_output, input_tensor)
return attention_output
class BertAttention(nn.Module):
def __init__(self, config):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)
def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output
class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.fused_gelu_bias = config.fused_gelu_bias
if config.fused_gelu_bias:
self.dense = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act)
else:
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
if not self.fused_gelu_bias:
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BertOutput(nn.Module):
def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertLayer(nn.Module):
def __init__(self, config):
super(BertLayer, self).__init__()
self.unpad = config.unpad
if config.fused_mha:
self.attention = FastBertAttention(config)
elif config.unpad:
self.attention = FastUnpadBertAttention(config)
else:
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(self, hidden_states, attention_mask, seqlen, batch):
if self.unpad:
attention_output = self.attention(hidden_states, attention_mask, seqlen, batch)
else:
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class BertEncoder(nn.Module):
def __init__(self, config):
super(BertEncoder, self).__init__()
layer = BertLayer(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
self.num_attention_heads = config.num_attention_heads
self.fused_mha=config.fused_mha
self.unpad=config.unpad
self.pad = config.pad
self.fuse_mask = config.fuse_mask
self.enable_stream = config.enable_stream
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False):
# Unpad inputs and mask. It will remove tokens that are padded. Assume ntokens is total number of tokens (padded and non-padded)
# and ntokens_unpad is total number of non-padded tokens. Then unpadding performs the following compression of the inputs:
# hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden]
batch = None
seqlen = None
if self.unpad:
batch = hidden_states.shape[0]
maxseqlen = hidden_states.shape[1]
hidden_size = hidden_states.shape[2]
attention_indices, attention_mask, seqlen, ntokens = generate_mask(attention_mask, self.num_attention_heads, pad=self.pad, fuse_mask=self.fuse_mask)
if self.pad == True and self.enable_stream == False:
hidden_states = hidden_states.view(batch,maxseqlen,hidden_size).permute(1,0,2).contiguous().view(batch*maxseqlen,hidden_size).contiguous()
if self.pad == True and self.enable_stream == True:
hidden_states = hidden_states.view(batch*maxseqlen,hidden_size)
if self.pad == False:
hidden_states = UnpadInput.apply(hidden_states.view(batch*maxseqlen, hidden_size).contiguous(), attention_indices, batch, maxseqlen, hidden_size, ntokens)
all_encoder_layers = []
def custom(start, end):
def custom_forward(*inputs):
layers = self.layer[start:end]
x_ = inputs[0]
for layer in layers:
x_ = layer(x_, inputs[1])
return x_
return custom_forward
if checkpoint_activations:
l = 0
num_layers = len(self.layer)
chunk_length = math.ceil(math.sqrt(num_layers))
while l < num_layers:
hidden_states = checkpoint.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1)
l += chunk_length
else:
if self.fused_mha:
hidden_states = hidden_states.permute(1,0,2).contiguous()
for i,layer_module in enumerate(self.layer):
hidden_states = layer_module(hidden_states, attention_mask, seqlen, batch)
if output_all_encoded_layers:
if self.fused_mha:
all_encoder_layers.append(hidden_states.permute(1,0,2).contiguous())
else:
all_encoder_layers.append(hidden_states)
# Pad inputs and mask. It will insert back zero-padded tokens. Assume ntokens is total number of tokens (padded and non-padded)
# and ntokens_unpad is total number of non-padded tokens. Then padding performs the following de-compression:
# hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden]
if self.unpad:
if self.pad == True and self.enable_stream == False:
hidden_states = hidden_states.view(maxseqlen,batch,hidden_size).permute(1,0,2).contiguous().view(batch,maxseqlen,hidden_size).contiguous()
if self.pad == True and self.enable_stream == True:
hidden_states = hidden_states.view(batch,maxseqlen,hidden_size)
if self.pad == False:
hidden_states = PadInput.apply(hidden_states, attention_indices, batch, maxseqlen, hidden_size, ntokens).view(batch, maxseqlen, hidden_size).contiguous()
if not output_all_encoded_layers or checkpoint_activations:
if self.fused_mha:
all_encoder_layers.append(hidden_states.permute(1,0,2).contiguous())
else:
all_encoder_layers.append(hidden_states)
return all_encoder_layers
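# Note on checkpoint_activations above: layers are checkpointed in contiguous
# chunks of ceil(sqrt(num_layers)) (e.g. 5 layers per chunk for 24-layer
# BERT-large), trading extra recomputation in backward for lower activation memory.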
class BertPooler(nn.Module):
def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config, bert_model_embedding_weights):
super(BertLMPredictionHead, self).__init__()
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
bert_model_embedding_weights.size(0),
bias=False)
self.decoder.weight = bert_model_embedding_weights
self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states
class BertOnlyMLMHead(nn.Module):
def __init__(self, config, bert_model_embedding_weights):
super(BertOnlyMLMHead, self).__init__()
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class BertOnlyNSPHead(nn.Module):
def __init__(self, config):
super(BertOnlyNSPHead, self).__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
class BertPreTrainingHeads(nn.Module):
def __init__(self, config, bert_model_embedding_weights):
super(BertPreTrainingHeads, self).__init__()
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
self.dense_seq_output = config.dense_seq_output
def forward(self, sequence_output, pooled_output, masked_lm_labels):
if self.dense_seq_output:
# We are masking out elements that won't contribute to loss because of masked lm labels
sequence_flattened = torch.index_select(sequence_output.view(-1,sequence_output.shape[-1]), 0, torch.nonzero(masked_lm_labels.view(-1) != -1, as_tuple=False).squeeze())
sequence_output = sequence_flattened
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
))
self.config = config
# Make sure the vocab size is padded up to a multiple of 8.
if self.config.vocab_size % 8 != 0:
self.config.vocab_size += 8 - (self.config.vocab_size % 8)
if get_rank() == 0:
print(f'Padded vocab_size to : {self.config.vocab_size}')
def init_bert_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
@classmethod
def from_pretrained(cls, pretrained_checkpoint, state_dict=None, cache_dir=None,
from_tf=False, config=None, *inputs, **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_checkpoint: either:
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
logger.info("loading archive file {}".format(pretrained_checkpoint))
assert config, "BERT configuration file must be provided to from_pretraining()"
logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(config, *inputs, **kwargs)
if state_dict is None and not from_tf:
state_dict = torch.load(pretrained_checkpoint, map_location='cpu' if not torch.cuda.is_available() else None)
if from_tf:
# Directly load from a TensorFlow checkpoint
return load_tf_weights_in_bert(model, pretrained_checkpoint, use_fast_mha=config.fused_mha)
# Load from a PyTorch state_dict
old_keys = []
new_keys = []
# print(f'loading keys: {state_dict.keys()}')
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
start_prefix = ''
if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
start_prefix = 'bert.'
load(model, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
return model
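# Illustrative usage sketch for from_pretrained (never called here). The config
# must already carry the extra flags this implementation reads (unpad, fused_mha,
# fused_gelu_bias, dense_seq_output, pad, ...); the checkpoint path is hypothetical.
def _example_from_pretrained(config, checkpoint_path="/path/to/pytorch_model.bin"):
    return BertForQuestionAnswering.from_pretrained(checkpoint_path, config=config)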
class BertModel(BertPreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
config: a BertConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated with the first token of the
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertModel, self).__init__(config)
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)
self.apply(self.init_bert_weights)
self.unpad = config.unpad
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, checkpoint_activations=False):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
# The attention mask is kept as a 2D [batch_size, seq_length] tensor here; the
# broadcast dimensions ([batch_size, 1, 1, to_seq_length], shared across heads
# and query positions) or the unpadded form are produced inside the attention
# implementations. This mask is simpler than the triangular causal mask used in
# OpenAI GPT; it only hides padding tokens.
extended_attention_mask = attention_mask
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
if self.unpad == False:
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers, checkpoint_activations=checkpoint_activations)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output
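# Illustrative sketch of the additive mask built above for the padded path:
# real tokens contribute 0.0 to the raw attention scores, padded tokens
# contribute -10000.0, which drives their softmax weight to ~0.
def _example_extended_attention_mask():
    attention_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0, 0.0]])
    additive = (1.0 - attention_mask) * -10000.0
    return additive  # tensor([[0., 0., 0., -10000., -10000.]])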
class BertForPreTraining(BertPreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForPreTraining, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
self.dense_seq_output = config.dense_seq_output
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, checkpoint_activations=False):
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations)
# If dense_seq_output, the prediction scores returned by self.cls are already restricted to the masked positions, with the first dimension flattened
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_lm_labels)
if self.dense_seq_output:
masked_lm_labels_flat = masked_lm_labels.view(-1)
mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -1]
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
if self.dense_seq_output:
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), mlm_labels.view(-1))
else:
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
#print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))
total_loss = masked_lm_loss + next_sentence_loss
# Masked Language Model Accuracy
if not self.dense_seq_output:
prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1])
masked_lm_labels_flat = masked_lm_labels.view(-1)
mlm_predictions_scores = prediction_scores_flat[masked_lm_labels_flat != -1]
mlm_predictions = mlm_predictions_scores.argmax(dim=-1)
mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -1]
else:
mlm_predictions = prediction_scores.argmax(dim=-1)
mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float)/mlm_labels.numel()
return total_loss, mlm_acc, mlm_labels.numel()
else: #TODO: Handle this path for dense sequence output as well
return prediction_scores, seq_relationship_score
class BertForMaskedLM(BertPreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForMaskedLM, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, checkpoint_activations=False):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
prediction_scores = self.cls(sequence_output)
if masked_lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
return masked_lm_loss
else:
return prediction_scores
class BertForNextSentencePrediction(BertPreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForNextSentencePrediction, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, checkpoint_activations=False):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
seq_relationship_score = self.cls(pooled_output)
if next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
return next_sentence_loss
else:
return seq_relationship_score
class BertForSequenceClassification(BertPreTrainedModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
class BertForMultipleChoice(BertPreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(config, num_choices)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_choices):
super(BertForMultipleChoice, self).__init__(config)
self.num_choices = num_choices
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, self.num_choices)
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
return loss
else:
return reshaped_logits
class BertForTokenClassification(BertPreTrainedModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
class BertForQuestionAnswering(BertPreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.bert = BertModel(config)
# TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, checkpoint_activations=False):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, the positions may carry an extra dimension; squeeze it out
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions fall outside the model inputs; clamp them to an ignored index so they do not contribute to the loss
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
return total_loss
else:
return start_logits, end_logits
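# Illustrative post-processing sketch for BertForQuestionAnswering at inference
# time: greedy span selection from the start/end logits. The real SQuAD script
# uses n-best decoding with length constraints; this is only a minimal example.
def _example_decode_span(start_logits, end_logits):
    start_index = int(torch.argmax(start_logits, dim=-1)[0])
    end_index = int(torch.argmax(end_logits, dim=-1)[0])
    return start_index, end_index  # token positions of the predicted answer span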
import torch
import math
#######################################################################################################################################################################
def unpad_input(out_, in_, indices):
out_[:,:] = in_[indices[:],:]
def pad_input(out_, in_, indices):
out_[indices[:],:] = in_[:,:]
def unpad_mask(out_, in_, indices):
out_[:] = in_.flatten()[indices[:]]
#######################################################################################################################################################################
def generate_mask(attention_mask, heads, pad=False, fuse_mask=True):
seqlen = attention_mask.sum(dim=1).float().cpu()
if pad == False:
seqlen[:] = ((seqlen[:] + 16 - 1) / 16).floor()*16
seqlen[seqlen < 16] = 16
seqlen = seqlen.int()
ntokens = seqlen.sum().item()
else:
batch = attention_mask.shape[0]
maxseqlen = attention_mask.shape[1]
seqlen.fill_(maxseqlen)
seqlen = seqlen.int()
ntokens = batch * maxseqlen
padded_mask = attention_mask.clone()
for i in range(len(seqlen)):
padded_mask[i,:seqlen[i]] = 1
indices = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten()
if pad==False and fuse_mask == True:
mask = torch.zeros([ntokens], device="cuda", dtype=torch.float16)
unpad_mask(mask, attention_mask, indices)
mask = (1 - mask) * -10000.0
elif pad==False and fuse_mask == False:
padded_mask = (padded_mask.unsqueeze(1) * padded_mask.unsqueeze(2)).unsqueeze(1).half().repeat(1, heads, 1, 1)
indices_mask = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten()
mask = torch.zeros([len(indices_mask)], device="cuda", dtype=torch.float16)
unpad_mask(mask, padded_mask, indices_mask)
mask = (1 - mask) * -10000.0
elif pad==True and fuse_mask == True:
mask = -10000.0 * (1 - attention_mask).half().view(-1)
elif pad==True and fuse_mask == False:
mask = -10000.0 * (1 - (attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2))).unsqueeze(1).half().repeat(1, heads, 1, 1).view(-1)
return indices, mask, seqlen, ntokens
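# Illustrative usage sketch (not part of the original file): generate_mask is called once
# per batch on the [batch, seqlen] attention mask. With pad=False it returns the flat
# indices of the kept token positions, an additive float16 mask (0 for real tokens,
# -10000 for the extra positions introduced by rounding each length up to a multiple of 16),
# the rounded per-sample lengths, and the total token count, e.g.
#
#   indices, mask, seqlen, ntokens = generate_mask(attention_mask, heads=16)
#
# The value of `heads` above is a placeholder; it is only used when fuse_mask=False.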
#######################################################################################################################################################################
class PadInput(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens):
ctx.save_for_backward(indices)
ctx.hidden = hidden
ctx.ntokens = ntokens
ntokens = batch*maxseqlen
output = torch.zeros([ntokens,hidden], device="cuda", dtype=torch.float16)
pad_input(output, input, indices)
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
indices, = ctx.saved_tensors
grad_input = torch.zeros([ctx.ntokens,ctx.hidden], device="cuda", dtype=torch.float16)
unpad_input(grad_input, grad_output, indices)
return grad_input[:ctx.ntokens], None, None, None, None, None
#######################################################################################################################################################################
class UnpadInput(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens):
ctx.save_for_backward(indices)
ctx.hidden = hidden
ctx.ntokens = batch*maxseqlen
output = torch.zeros([ntokens, hidden], device="cuda", dtype=torch.float16)
unpad_input(output, input, indices)
return output[:ntokens]
@staticmethod
def backward(ctx, grad_output):
indices, = ctx.saved_tensors
grad_input = torch.zeros([ctx.ntokens,ctx.hidden], device="cuda", dtype=torch.float16)
pad_input(grad_input, grad_output, indices)
return grad_input[:ctx.ntokens], None, None, None, None, None
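# Illustrative usage sketch (not part of the original file): UnpadInput/PadInput are the
# autograd-aware wrappers around the gather/scatter helpers above. A typical round trip
# on a [batch*maxseqlen, hidden] float16 tensor looks roughly like
#
#   unpadded = UnpadInput.apply(hidden_states, indices, batch, maxseqlen, hidden, ntokens)
#   ...                                      # run the encoder on the unpadded tokens
#   padded = PadInput.apply(unpadded, indices, batch, maxseqlen, hidden, ntokens)
#
# where `indices` and `ntokens` come from generate_mask; `hidden_states` is a placeholder name.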
#######################################################################################################################################################################
import torch
import mhalib
###########################################################################################
class FastSoftmaxFunction(torch.autograd.Function):
@staticmethod
def forward(cxt, input, dim, batch, seqlen, heads, stream, sync, timers):
if timers: timers['start_fprop'].record()
mhalib.FastSoftmaxFprop(input, batch, seqlen, heads, stream, sync)
if timers: timers['stop_fprop'].record()
cxt.save_for_backward(input,seqlen)
cxt.dim = dim
cxt.batch = batch
cxt.heads = heads
cxt.stream = stream
cxt.sync = sync
cxt.timers = timers
return input
@staticmethod
def backward(cxt, grad_output):
output, seqlen, = cxt.saved_tensors
dim = cxt.dim
batch = cxt.batch
heads = cxt.heads
if cxt.timers: cxt.timers['start_dgrad'].record()
mhalib.FastSoftmaxBprop(output, grad_output, batch, seqlen, heads, cxt.stream, cxt.sync)
if cxt.timers: cxt.timers['stop_dgrad'].record()
return grad_output, None, None, None, None, None, None, None
class FastSoftmax(torch.nn.Module):
def __init__(self, dim=None, stream=True, sync=True, timer=False):
super(FastSoftmax, self).__init__()
self.dim = dim
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, input, batch, seqlen, heads):
return FastSoftmaxFunction.apply(input, self.dim, batch, seqlen, heads, self.stream, self.sync, self.timers)
###########################################################################################
class FastMaskSoftmaxFunction(torch.autograd.Function):
@staticmethod
def forward(cxt, input, mask, dim, batch, seqlen, heads, stream, sync, timers):
if timers: timers['start_fprop'].record()
mhalib.FastMaskSoftmaxFprop(input, mask, batch, seqlen, heads, stream, sync)
if timers: timers['stop_fprop'].record()
cxt.save_for_backward(input,seqlen)
cxt.dim = dim
cxt.batch = batch
cxt.heads = heads
cxt.stream = stream
cxt.sync = sync
cxt.timers = timers
return input
@staticmethod
def backward(cxt, grad_output):
output, seqlen, = cxt.saved_tensors
dim = cxt.dim
batch = cxt.batch
heads = cxt.heads
if cxt.timers: cxt.timers['start_dgrad'].record()
mhalib.FastSoftmaxBprop(output, grad_output, batch, seqlen, heads, cxt.stream, cxt.sync)
if cxt.timers: cxt.timers['stop_dgrad'].record()
return grad_output, None, None, None, None, None, None, None, None, None, None, None
class FastMaskSoftmax(torch.nn.Module):
def __init__(self, dim=None, stream=True, sync=True, timer=False):
super(FastMaskSoftmax, self).__init__()
self.dim = dim
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, input, mask, batch, seqlen, heads):
return FastMaskSoftmaxFunction.apply(input, mask, self.dim, batch, seqlen, heads, self.stream, self.sync, self.timers)
###########################################################################################
class FastMaskSoftmaxDropoutFunction(torch.autograd.Function):
@staticmethod
def forward(cxt, input, mask, dim, batch, seqlen, heads, dropout_prob, stream, sync, timers, is_training):
if timers: timers['start_fprop'].record()
output, dropout_mask, = mhalib.FastMaskSoftmaxDropoutFprop(input, mask, batch, seqlen, heads, dropout_prob, stream, sync, is_training)
if timers: timers['stop_fprop'].record()
cxt.save_for_backward(input,dropout_mask,seqlen)
cxt.dim = dim
cxt.batch = batch
cxt.heads = heads
cxt.dropout_prob = dropout_prob
cxt.stream = stream
cxt.sync = sync
cxt.timers = timers
return output
@staticmethod
def backward(cxt, grad_output):
output, dropout_mask, seqlen, = cxt.saved_tensors
dim = cxt.dim
batch = cxt.batch
heads = cxt.heads
dropout_prob = cxt.dropout_prob
if cxt.timers: cxt.timers['start_dgrad'].record()
mhalib.FastMaskSoftmaxDropoutBprop(output, grad_output, dropout_mask, batch, seqlen, heads, dropout_prob, cxt.stream, cxt.sync)
if cxt.timers: cxt.timers['stop_dgrad'].record()
return grad_output, None, None, None, None, None, None, None, None, None, None, None, None, None
class FastMaskSoftmaxDropout(torch.nn.Module):
def __init__(self, dim=None, dropout_prob=None, stream=True, sync=True, timer=False):
super(FastMaskSoftmaxDropout, self).__init__()
self.dim = dim
self.dropout_prob = dropout_prob
self.stream = stream
self.sync = sync
if timer:
self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True),
'start_dgrad':torch.cuda.Event(enable_timing=True),
'stop_fprop':torch.cuda.Event(enable_timing=True),
'stop_dgrad':torch.cuda.Event(enable_timing=True)}
else:
self.timers = None
def forward(self, input, mask, batch, seqlen, heads, is_training):
return FastMaskSoftmaxDropoutFunction.apply(input, mask, self.dim, batch, seqlen, heads, self.dropout_prob, self.stream, self.sync, self.timers, is_training)
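# Illustrative usage sketch (not part of the original file): the fused modules above take
# the unpadded attention scores together with the additive mask and per-sample lengths
# produced by generate_mask. Assuming such tensors exist, usage looks roughly like
#
#   softmax_dropout = FastMaskSoftmaxDropout(dim=-1, dropout_prob=0.1)
#   probs = softmax_dropout(scores, mask, batch, seqlen, heads, is_training=True)
#
# The exact tensor layout expected by the kernels is defined by the mhalib extension.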
###########################################################################################
python3 convert_tf_checkpoint.py \
--bert_model "bert-large-uncased" \
--tf_checkpoint /public/home/hepj/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
--bert_config_path /public/home/hepj/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--output_checkpoint /public/home/hepj/model_source/model_pytorch.ckpt.pt
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist
from contextlib import contextmanager
import logging.config
import random
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(seeds, device):
"""
Broadcasts random seeds to all distributed workers.
Returns list of random seeds (broadcasted from workers with rank 0).
:param seeds: list of seeds (integers)
:param device: torch.device
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
seeds_tensor = torch.LongTensor(seeds).to(device)
torch.distributed.broadcast(seeds_tensor, 0)
seeds = seeds_tensor.tolist()
return seeds
def setup_seeds(master_seed, epochs, device):
"""
Generates seeds from one master_seed.
Function returns (worker_seeds, shuffling_seeds), worker_seeds are later
used to initialize per-worker random number generators (mostly for
dropouts), shuffling_seeds are for RNGs responsible for reshuffling the
dataset before each epoch.
Seeds are generated on worker with rank 0 and broadcasted to all other
workers.
:param master_seed: master RNG seed used to initialize other generators
:param epochs: number of epochs
:param device: torch.device (used for distributed.broadcast)
"""
if master_seed is None:
# random master seed, random.SystemRandom() uses /dev/urandom on Unix
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if get_rank() == 0:
# The master seed is reported only from the rank=0 worker to avoid confusion;
# the seeds generated on rank=0 are later broadcast to the other workers.
logging.info(f'Using random master seed: {master_seed}')
else:
# master seed was specified from command line
logging.info(f'Using master seed from command line: {master_seed}')
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, get_world_size())
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
# broadcast seeds from rank=0 to other workers
worker_seeds = broadcast_seeds(worker_seeds, device)
shuffling_seeds = broadcast_seeds(shuffling_seeds, device)
return worker_seeds, shuffling_seeds
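# Illustrative usage sketch (not part of the original utilities): a training script would
# typically call setup_seeds once after the process group is initialized, e.g.
#
#   device = set_device(cuda=True, local_rank=args.local_rank)
#   worker_seeds, shuffling_seeds = setup_seeds(args.seed, args.epochs, device)
#   torch.manual_seed(worker_seeds[get_rank()])
#
# `args` is a placeholder for the caller's argument object.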
def barrier():
"""
Works as a temporary distributed barrier; PyTorch does not currently
implement a barrier for the NCCL backend.
Calls all_reduce on a dummy tensor and synchronizes with the GPU.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
torch.cuda.synchronize()
def get_rank():
"""
Gets distributed rank or returns zero if distributed is not initialized.
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
else:
rank = 0
return rank
def get_world_size():
"""
Gets total number of distributed workers or returns one if distributed is
not initialized.
"""
if torch.distributed.is_available():
print("Torch distributed is available.")
else:
print("Torch distributed is not available.")
if torch.distributed.is_initialized():
print("Torch distributed is initialized.")
else:
print("Torch distributed is not initialized.")
if torch.distributed.is_available() and torch.distributed.is_initialized():
world_size = torch.distributed.get_world_size()
else:
world_size = 1
return world_size
def set_device(cuda, local_rank):
"""
Sets device based on local_rank and returns instance of torch.device.
:param cuda: if True: use cuda
:param local_rank: local rank of the worker
"""
if cuda:
torch.cuda.set_device(local_rank)
device = torch.device('cuda')
else:
device = torch.device('cpu')
return device
@contextmanager
def sync_workers():
"""
Yields distributed rank and synchronizes all workers on exit.
"""
rank = get_rank()
yield rank
barrier()
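# Illustrative usage sketch (not part of the original utilities): sync_workers lets a single
# rank perform an action while keeping all ranks synchronized afterwards, e.g.
#
#   with sync_workers() as rank:
#       if rank == 0:
#           torch.save(model.state_dict(), "checkpoint.pt")  # placeholder path
#
# Every rank then waits at the barrier when the context manager exits.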
def is_main_process():
return get_rank() == 0
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
import six
from io import open
from file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
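# Illustrative usage sketch (not part of the original module): given a local vocabulary file
# (the path below is a placeholder), the tokenizer is used roughly as follows:
#
#   tokenizer = BertTokenizer("vocab.txt", do_lower_case=True)
#   tokens = tokenizer.tokenize("Who discovered penicillin?")
#   ids = tokenizer.convert_tokens_to_ids(tokens)
#
# BertTokenizer.from_pretrained("bert-large-uncased") instead downloads and caches the
# matching vocabulary file.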
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because the English Wikipedia does contain
# some Chinese words).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Deploying the BERT model using Triton Inference Server
## Solution overview
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
This folder contains detailed performance analysis as well as scripts to run SQuAD fine-tuning on BERT model using Triton Inference Server.
## Setup
The first step is to train BERT for question answering. The process is the same as in the main readme.
1. Download the SQuAD dataset with `cd [bert folder]/data/squad/ && bash ./squad_download.sh`.
2. Build the Docker container with `bash ./scripts/docker/build.sh`.
3. [train](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT#training-process) your own checkpoint and fine-tune it, or [download](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1/files) the already trained and fine-tuned checkpoint from the [NGC](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1/files) model repository.
The checkpoint should be placed in `[bert folder]/checkpoints/<checkpoint>`. By default, the scripts assume `<checkpoint>` is `bert_qa.pt`; therefore, you might have to rename the trained or downloaded model accordingly.
Note: The following instructions are run from outside the container and call `docker run` commands as required. \
Unless stated otherwise, all the commands below have to be executed from `[bert folder]`.
## Quick Start Guide
### Deploying the model
The following command exports the checkpoint to `torchscript`, and deploys the Triton model repository.
`bash ./triton/export_model.sh`
The deployed Triton model repository will be in `[bert folder]/results/triton_models`.
Edit `[bert folder]/triton/export_model.sh` to deploy BERT in ONNX format.
Change the value of `EXPORT_FORMAT` from `ts-script` to `onnx`, and change the value of `triton_model_name` from `bertQA-ts` to `bertQA-onnx` accordingly.
Moreover, you may set `precision` to either `fp32` or `fp16`.
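For example, to deploy in ONNX format at half precision, the relevant assignments in the script would be set to `EXPORT_FORMAT="onnx"`, `triton_model_name="bertQA-onnx"` and `precision="fp16"` (variable names as referenced above; the exact assignment syntax in the script may differ).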
### Running the Triton server
To launch the Triton server, execute the following command.
`docker run --rm --gpus device=0 --ipc=host --network=host -p 8000:8000 -p 8001:8001 -p 8002:8002 -v $PWD/results/triton_models:/models nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1`
Here `--gpus device=0` exposes only GPU 0 to the server. Writing `device=0,1,2,3` would select GPUs 0, 1, 2 and 3, while `device=all` exposes all available GPUs to the server.
By default, the server expects the model repository to be in `[bert folder]/results/triton_models`.
### Running the custom Triton client
The custom Triton client is found in `[bert folder]/triton/client.py`.
It may be used once BERT is deployed and the Triton server is running. To try it, do the following steps.
1. Start the BERT docker container with the following command: \
`docker run -it --rm --ipc=host --network=host -v $PWD/vocab:/workspace/bert/vocab bert:latest` \
Note that the client does not require GPU support.
2. Move to the triton folder with the following command: \
`cd /workspace/bert/triton/`
3. Run the client with the following command: \
`python client.py --do_lower_case --version_2_with_negative --vocab_file=../vocab/vocab --triton-model-name=bertQA-ts-script`
This will send a request to the already running Triton server, which will process it, and return the result to the client. The response will be printed on the screen.
You may send your own question-context pair for processing using the `--question` and `--context` flags of `client.py`; see the example below.
You may want to use the `--triton-model-name` flag to select the model in onnx format.
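For example, an illustrative call with placeholder question and context strings looks like: \
`python client.py --do_lower_case --version_2_with_negative --vocab_file=../vocab/vocab --triton-model-name=bertQA-ts-script --question="Who discovered penicillin?" --context="Alexander Fleming discovered penicillin in 1928."`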
### Evaluating the deployed model on SQuAD 1.1
To deploy and evaluate your model, run the following command.
`bash ./triton/evaluate.sh`
By default, this will deploy BERT in `torchscript` format and evaluate it on SQuAD 1.1.
You may change the format of deployment by editing `[bert folder]/triton/evaluate.sh`.
Change the value of `EXPORT_FORMAT` from `ts-script` to `onnx`. Moreover, you may set `precision` to either `fp32` or `fp16`.
### Generating performance data
To collect performance data, run the following command.
`bash ./triton/generate_figures.sh`
By default, this will deploy BERT in `torchscript` format, launch the server, run the perf client, collect statistics and place them in `[bert folder]/results/triton_models/perf_client`.
You may change the format of deployment by editing `./triton/generate_figures.sh`: change the value of `EXPORT_FORMAT` from `ts-script` to `onnx`.
Moreover, you may set `precision` to either `fp32` or `fp16`.
## Advanced
### Other scripts
To launch the Triton server in a detached state, run the following command.
`bash ./triton/launch_triton_server.sh`
By default, the Triton server is expecting the model repository in `[bert folder]/results/triton_models`.
To make the machine wait until the server is initialized, and the model is ready for inference, run the following command.
`bash ./triton/wait_for_triton_server.sh`
## Performance
The numbers below are averages, measured with Triton on a V100 32GB GPU, with [static batching](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/model_configuration.html#scheduling-and-batching).
| Format | GPUs | Batch size | Sequence length | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (mixed precision/FP32) |
|--------|------|------------|-----------------|----------------------------------|---------------------------------------------|--------------------------------------------|
|pytorch | 1 | 1 | 384 | 30.1 | 28.0 | 0.93x |
|pytorch | 1 | 8 | 384 | 36.0 | 116.8 | 3.24x |
|torchscript | 1 | 1 | 384 | 32.20 | 38.40 | 1.19x |
|torchscript | 1 | 8 | 384 | 40.00 | 134.40 | 3.36x |
|onnx | 1 | 1 | 384 | 33.30 | 92.00 | 2.76x |
|onnx | 1 | 8 | 384 | 42.60 | 165.30 | 3.88x |