Commit 27dab946 authored by huchen

Merge branch 'GNMT-v2' into 'main'

Updated GNMT v2

See merge request dcutoolkit/deeplearing/dlexamples_new!11
parents 20291e9d 07c30a15
-import logging
+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import math
 import torch
@@ -6,25 +26,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
-import seq2seq.attn_score._C as C
-
-
-class AttentionScore(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, att_query, att_keys, bias, linear_att):
-        score = C.forward(att_query, att_keys, bias, linear_att)
-        ctx.save_for_backward(att_query, att_keys, bias, linear_att)
-        return score
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        att_query, att_keys, bias, linear_att = ctx.saved_tensors
-        grad_query, grad_keys, grad_bias, grad_linear_att = \
-            C.backward(grad_output, att_query, att_keys,
-                       bias, linear_att)
-        return grad_query, grad_keys, grad_bias, grad_linear_att
-
-
-fused_calc_score = AttentionScore.apply
 class BahdanauAttention(nn.Module):
     """
@@ -32,7 +33,7 @@ class BahdanauAttention(nn.Module):
     Implementation is very similar to tf.contrib.seq2seq.BahdanauAttention
     """
     def __init__(self, query_size, key_size, num_units, normalize=False,
-                 batch_first=False, init_weight=0.1, fusion=True):
+                 batch_first=False, init_weight=0.1):
         """
         Constructor for the BahdanauAttention.
@@ -68,8 +69,6 @@ class BahdanauAttention(nn.Module):
             self.register_parameter('normalize_bias', None)
         self.reset_parameters(init_weight)
-        self.fusion = fusion
-        logging.info(f'Fused attention flag set to {fusion}')
     def reset_parameters(self, init_weight):
         """
@@ -161,18 +160,12 @@ class BahdanauAttention(nn.Module):
         processed_key = self.linear_k(keys)
         # scores: (b x t_q x t_k)
-        if self.fusion:
-            linear_att = self.linear_att / self.linear_att.norm()
-            linear_att = linear_att * self.normalize_scalar
-            scores = fused_calc_score(processed_query, processed_key,
-                                      self.normalize_bias, linear_att)
-        else:
-            scores = self.calc_score(processed_query, processed_key)
+        scores = self.calc_score(processed_query, processed_key)
         if self.mask is not None:
             mask = self.mask.unsqueeze(1).expand(b, t_q, t_k)
             # I can't use -INF because of overflow check in pytorch
-            scores.data.masked_fill_(mask, -65504.0)
+            scores.masked_fill_(mask, -65504.0)
         # Normalize the scores, softmax over t_k
         scores_normalized = F.softmax(scores, dim=-1)
...

+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import itertools
 import torch
@@ -14,7 +35,7 @@ class RecurrentAttention(nn.Module):
     """
     def __init__(self, input_size=1024, context_size=1024, hidden_size=1024,
                  num_layers=1, batch_first=False, dropout=0.2,
-                 init_weight=0.1, fusion=True):
+                 init_weight=0.1):
         """
         Constructor for the RecurrentAttention.
@@ -35,8 +56,7 @@ class RecurrentAttention(nn.Module):
         init_lstm_(self.rnn, init_weight)
         self.attn = BahdanauAttention(hidden_size, context_size, context_size,
-                                      normalize=True, batch_first=batch_first,
-                                      fusion=fusion)
+                                      normalize=True, batch_first=batch_first)
         self.dropout = nn.Dropout(dropout)
@@ -105,7 +125,7 @@ class ResidualRecurrentDecoder(nn.Module):
     on inputs to LSTM layers.
     """
     def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
-                 batch_first=False, embedder=None, init_weight=0.1, fusion=True):
+                 batch_first=False, embedder=None, init_weight=0.1):
         """
         Constructor of the ResidualRecurrentDecoder.
@@ -126,8 +146,7 @@ class ResidualRecurrentDecoder(nn.Module):
         self.att_rnn = RecurrentAttention(hidden_size, hidden_size,
                                           hidden_size, num_layers=1,
                                           batch_first=batch_first,
-                                          dropout=dropout,
-                                          fusion=fusion)
+                                          dropout=dropout)
         self.rnn_layers = nn.ModuleList()
         for _ in range(num_layers - 1):
@@ -138,13 +157,13 @@ class ResidualRecurrentDecoder(nn.Module):
         for lstm in self.rnn_layers:
             init_lstm_(lstm, init_weight)
-        self.share_embedding = (embedder is not None)
         if embedder is not None:
             self.embedder = embedder
         else:
             self.embedder = nn.Embedding(vocab_size, hidden_size,
                                          padding_idx=config.PAD)
-            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)
+            nn.init.uniform_(self.embedder.weight.data, -init_weight,
+                             init_weight)
         self.classifier = Classifier(hidden_size, vocab_size)
         self.dropout = nn.Dropout(p=dropout)
@@ -201,10 +220,7 @@ class ResidualRecurrentDecoder(nn.Module):
         enc_context, enc_len, hidden = context
         hidden = self.init_hidden(hidden)
-        if self.share_embedding and self.training:
-            x = inputs
-        else:
-            x = self.embedder(inputs)
+        x = self.embedder(inputs)
         x, h, attn, scores = self.att_rnn(x, hidden[0], enc_context, enc_len)
         self.append_hidden(h)
...

-import torch
+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import torch.nn as nn
 from torch.nn.utils.rnn import pack_padded_sequence
 from torch.nn.utils.rnn import pad_packed_sequence
 import seq2seq.data.config as config
 from seq2seq.utils import init_lstm_
-import seq2seq.pack_utils._C as C
-
-
-class Revert_varlen(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, input, offsets):
-        ctx.offsets = offsets
-        return C.revert_varlen_tensor(input, offsets)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return C.revert_varlen_tensor(grad_output, ctx.offsets), None
-
-
-revert_varlen = Revert_varlen.apply
-
-
-class EmuBidirLSTM(nn.Module):
-    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, bidirectional=True):
-        super(EmuBidirLSTM, self).__init__()
-        assert num_layers == 1, "emulation bidirectional lstm works for a single layer only"
-        assert batch_first == False, "emulation bidirectional lstm works for batch_first = False only"
-        assert bidirectional == True, "use for bidirectional lstm only"
-        self.bidir = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first, bidirectional=True)
-        self.layer1 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first)
-        self.layer2 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first)
-        self.layer1.weight_ih_l0 = self.bidir.weight_ih_l0
-        self.layer1.weight_hh_l0 = self.bidir.weight_hh_l0
-        self.layer2.weight_ih_l0 = self.bidir.weight_ih_l0_reverse
-        self.layer2.weight_hh_l0 = self.bidir.weight_hh_l0_reverse
-        self.layer1.bias_ih_l0 = self.bidir.bias_ih_l0
-        self.layer1.bias_hh_l0 = self.bidir.bias_hh_l0
-        self.layer2.bias_ih_l0 = self.bidir.bias_ih_l0_reverse
-        self.layer2.bias_hh_l0 = self.bidir.bias_hh_l0_reverse
-
-    @staticmethod
-    def bidir_lstm(model, input, lengths):
-        packed_input = pack_padded_sequence(input, lengths)
-        out = model(packed_input)[0]
-        return pad_packed_sequence(out)[0]
-
-    @staticmethod
-    def emu_bidir_lstm(model0, model1, input, lengths):
-        mask = C.set_mask_cpp(lengths).unsqueeze(-1).to(input.device,
-                                                        input.dtype, non_blocking=True)
-        offsets = C.get_offsets(input, lengths)
-        inputl1 = revert_varlen(input, offsets)
-        out1 = model1(inputl1)
-        outputs = revert_varlen(out1[0], offsets)
-        out0 = model0(input)[0] * mask
-        out_bi = torch.cat([out0, outputs], dim=2)
-        return out_bi
-
-    def forward(self, input, lengths):
-        if (input.size(1) > 512):
-            return self.bidir_lstm(self.bidir, input, lengths)
-        else:
-            return self.emu_bidir_lstm(self.layer2, self.layer1, input, lengths)
 class ResidualRecurrentEncoder(nn.Module):
@@ -93,8 +57,8 @@ class ResidualRecurrentEncoder(nn.Module):
         self.rnn_layers = nn.ModuleList()
         # 1st LSTM layer, bidirectional
         self.rnn_layers.append(
-            EmuBidirLSTM(hidden_size, hidden_size, num_layers=1, bias=True,
+            nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                     batch_first=batch_first, bidirectional=True))
         # 2nd LSTM layer, with 2x larger input_size
         self.rnn_layers.append(
@@ -107,19 +71,18 @@ class ResidualRecurrentEncoder(nn.Module):
             nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                     batch_first=batch_first))
-        init_lstm_(self.rnn_layers[0].bidir)
-        for lstm in self.rnn_layers[1:]:
-            init_lstm_(lstm)
+        for lstm in self.rnn_layers:
+            init_lstm_(lstm, init_weight)
         self.dropout = nn.Dropout(p=dropout)
-        self.share_embedding = (embedder is not None)
         if embedder is not None:
             self.embedder = embedder
         else:
             self.embedder = nn.Embedding(vocab_size, hidden_size,
                                          padding_idx=config.PAD)
-            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)
+            nn.init.uniform_(self.embedder.weight.data, -init_weight,
+                             init_weight)
     def forward(self, inputs, lengths):
         """
@@ -130,14 +93,14 @@ class ResidualRecurrentEncoder(nn.Module):
         returns: tensor with encoded sequences
         """
-        if self.share_embedding and self.training:
-            x = inputs
-        else:
-            x = self.embedder(inputs)
+        x = self.embedder(inputs)
         # bidirectional layer
         x = self.dropout(x)
-        x = self.rnn_layers[0](x, lengths)
+        x = pack_padded_sequence(x, lengths.cpu().numpy(),
+                                 batch_first=self.batch_first)
+        x, _ = self.rnn_layers[0](x)
+        x, _ = pad_packed_sequence(x, batch_first=self.batch_first)
         # 1st unidirectional layer
         x = self.dropout(x)
...

+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import torch.nn as nn
 import seq2seq.data.config as config
 from seq2seq.models.decoder import ResidualRecurrentDecoder
 from seq2seq.models.encoder import ResidualRecurrentEncoder
 from seq2seq.models.seq2seq_base import Seq2Seq
-import torch
-import time
 class GNMT(Seq2Seq):
     """
     GNMT v2 model
     """
     def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
-                 batch_first=False, share_embedding=True, fusion=True):
+                 batch_first=False, share_embedding=True):
         """
         Constructor for the GNMT v2 model.
@@ -36,55 +56,17 @@ class GNMT(Seq2Seq):
         else:
             embedder = None
-        self.embedder = embedder
         self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size,
                                                 num_layers, dropout,
                                                 batch_first, embedder)
         self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size,
                                                 num_layers, dropout,
-                                                batch_first, embedder,
-                                                fusion=fusion)
+                                                batch_first, embedder)
     def forward(self, input_encoder, input_enc_len, input_decoder):
-        if self.embedder:
-            input_encoder = self.embedder(input_encoder)
-            input_decoder = self.embedder(input_decoder)
         context = self.encode(input_encoder, input_enc_len)
-        input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
         context = (context, input_enc_len, None)
         output, _, _ = self.decode(input_decoder, context)
         return output
-    # def forward(self, input_encoder, input_enc_len, input_decoder):
-    #     if self.embedder:
-    #         input_encoder = self.embedder(input_encoder)
-    #         input_decoder = self.embedder(input_decoder)
-    ###aiss add for prof time
-    #     torch.cuda.synchronize()
-    #     t1 = time.time()
-    #     context = self.encode(input_encoder, input_enc_len)
-    #     torch.cuda.synchronize()
-    #     t2 = time.time()
-    #     input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
-    #     torch.cuda.synchronize()
-    #     t5 = time.time()
-    #     input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
-    #     torch.cuda.synchronize()
-    #     t6 = time.time()
-    #     context = (context, input_enc_len, None)
-    #     torch.cuda.synchronize()
-    #     t3 = time.time()
-    #     output, _, _ = self.decode(input_decoder, context)
-    #     torch.cuda.synchronize()
-    #     t4 = time.time()
-    #     print("encode time is ", (t2 - t1) * 1000)
-    #     print("decode time is ", (t4 - t3) * 1000)
-    #     print("process time is ", (t3 - t2) * 1000)
-    #     print("process copy time1 is ", (t5 - t2) * 1000)
-    #     print("process copy time2 is ", (t6 - t5) * 1000)
-    #     print("process concat time is ", (t3 - t6) * 1000)
-    #
-    #     return output
-    #

+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import torch.nn as nn
 from torch.nn.functional import log_softmax
...

import logging
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

import seq2seq.attn_score._C as C


class AttentionScore(torch.autograd.Function):
    @staticmethod
    def forward(ctx, att_query, att_keys, bias, linear_att):
        score = C.forward(att_query, att_keys, bias, linear_att)
        ctx.save_for_backward(att_query, att_keys, bias, linear_att)
        return score

    @staticmethod
    def backward(ctx, grad_output):
        att_query, att_keys, bias, linear_att = ctx.saved_tensors
        grad_query, grad_keys, grad_bias, grad_linear_att = \
            C.backward(grad_output, att_query, att_keys,
                       bias, linear_att)
        return grad_query, grad_keys, grad_bias, grad_linear_att


fused_calc_score = AttentionScore.apply
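

# Reference sketch (not part of the original file): the fused kernel above is
# used as a drop-in replacement for the pure PyTorch score computation in
# BahdanauAttention below, so a freshly built seq2seq.attn_score._C extension
# can be sanity-checked against this expression.
def _reference_attention_score(att_query, att_keys, bias, linear_att):
    # att_query: (b, t_q, n), att_keys: (b, t_k, n), bias and linear_att: (n,)
    sum_qk = att_query.unsqueeze(2) + att_keys.unsqueeze(1) + bias
    return torch.tanh(sum_qk).matmul(linear_att)  # (b, t_q, t_k)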


class BahdanauAttention(nn.Module):
    """
    Bahdanau Attention (https://arxiv.org/abs/1409.0473)
    Implementation is very similar to tf.contrib.seq2seq.BahdanauAttention
    """
    def __init__(self, query_size, key_size, num_units, normalize=False,
                 batch_first=False, init_weight=0.1, fusion=True):
        """
        Constructor for the BahdanauAttention.

        :param query_size: feature dimension for query
        :param key_size: feature dimension for keys
        :param num_units: internal feature dimension
        :param normalize: whether to normalize energy term
        :param batch_first: if True batch size is the 1st dimension, if False
            the sequence is first and batch size is second
        :param init_weight: range for uniform initializer used to initialize
            Linear key and query transform layers and linear_att vector
        """
        super(BahdanauAttention, self).__init__()

        self.normalize = normalize
        self.batch_first = batch_first
        self.num_units = num_units

        self.linear_q = nn.Linear(query_size, num_units, bias=False)
        self.linear_k = nn.Linear(key_size, num_units, bias=False)
        nn.init.uniform_(self.linear_q.weight.data, -init_weight, init_weight)
        nn.init.uniform_(self.linear_k.weight.data, -init_weight, init_weight)

        self.linear_att = Parameter(torch.Tensor(num_units))

        self.mask = None

        if self.normalize:
            self.normalize_scalar = Parameter(torch.Tensor(1))
            self.normalize_bias = Parameter(torch.Tensor(num_units))
        else:
            self.register_parameter('normalize_scalar', None)
            self.register_parameter('normalize_bias', None)

        self.reset_parameters(init_weight)

        self.fusion = fusion
        logging.info(f'Fused attention flag set to {fusion}')

    def reset_parameters(self, init_weight):
        """
        Sets initial random values for trainable parameters.
        """
        stdv = 1. / math.sqrt(self.num_units)
        self.linear_att.data.uniform_(-init_weight, init_weight)

        if self.normalize:
            self.normalize_scalar.data.fill_(stdv)
            self.normalize_bias.data.zero_()

    def set_mask(self, context_len, context):
        """
        sets self.mask which is applied before softmax
        ones for inactive context fields, zeros for active context fields

        :param context_len: b
        :param context: if batch_first: (b x t_k x n) else: (t_k x b x n)

        self.mask: (b x t_k)
        """
        if self.batch_first:
            max_len = context.size(1)
        else:
            max_len = context.size(0)

        indices = torch.arange(0, max_len, dtype=torch.int64,
                               device=context.device)
        self.mask = indices >= (context_len.unsqueeze(1))
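        # Example (illustrative): with context_len = tensor([3, 1]) and
        # max_len = 4 the resulting mask is
        #     [[False, False, False, True],
        #      [False, True,  True,  True]]
        # True marks padded key positions, which are excluded from the softmax
        # in forward() below.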

    def calc_score(self, att_query, att_keys):
        """
        Calculate Bahdanau score

        :param att_query: b x t_q x n
        :param att_keys: b x t_k x n

        returns: b x t_q x t_k scores
        """
        b, t_k, n = att_keys.size()
        t_q = att_query.size(1)

        att_query = att_query.unsqueeze(2).expand(b, t_q, t_k, n)
        att_keys = att_keys.unsqueeze(1).expand(b, t_q, t_k, n)
        sum_qk = att_query + att_keys

        if self.normalize:
            sum_qk = sum_qk + self.normalize_bias
            linear_att = self.linear_att / self.linear_att.norm()
            linear_att = linear_att * self.normalize_scalar
        else:
            linear_att = self.linear_att

        out = torch.tanh(sum_qk).matmul(linear_att)
        return out

    def forward(self, query, keys):
        """
        :param query: if batch_first: (b x t_q x n) else: (t_q x b x n)
        :param keys: if batch_first: (b x t_k x n) else (t_k x b x n)

        :returns: (context, scores_normalized)
        context: if batch_first: (b x t_q x n) else (t_q x b x n)
        scores_normalized: if batch_first (b x t_q x t_k) else (t_q x b x t_k)
        """
        # first dim of keys and query has to be 'batch', it's needed for bmm
        if not self.batch_first:
            keys = keys.transpose(0, 1)
            if query.dim() == 3:
                query = query.transpose(0, 1)

        if query.dim() == 2:
            single_query = True
            query = query.unsqueeze(1)
        else:
            single_query = False

        b = query.size(0)
        t_k = keys.size(1)
        t_q = query.size(1)

        # FC layers to transform query and key
        processed_query = self.linear_q(query)
        processed_key = self.linear_k(keys)

        # scores: (b x t_q x t_k)
        if self.fusion:
            linear_att = self.linear_att / self.linear_att.norm()
            linear_att = linear_att * self.normalize_scalar
            scores = fused_calc_score(processed_query, processed_key,
                                      self.normalize_bias, linear_att)
        else:
            scores = self.calc_score(processed_query, processed_key)

        if self.mask is not None:
            mask = self.mask.unsqueeze(1).expand(b, t_q, t_k)
            # I can't use -INF because of overflow check in pytorch
            scores.data.masked_fill_(mask, -65504.0)

        # Normalize the scores, softmax over t_k
        scores_normalized = F.softmax(scores, dim=-1)

        # Calculate the weighted average of the attention inputs according to
        # the scores
        # context: (b x t_q x n)
        context = torch.bmm(scores_normalized, keys)

        if single_query:
            context = context.squeeze(1)
            scores_normalized = scores_normalized.squeeze(1)
        elif not self.batch_first:
            context = context.transpose(0, 1)
            scores_normalized = scores_normalized.transpose(0, 1)

        return context, scores_normalized
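

# Usage sketch (not part of the original file; shapes are illustrative).
# fusion=False keeps the pure PyTorch calc_score path; importing this module
# still requires the seq2seq.attn_score._C extension to be built.
if __name__ == '__main__':
    attn = BahdanauAttention(query_size=16, key_size=16, num_units=16,
                             normalize=True, batch_first=True, fusion=False)
    query = torch.randn(4, 2, 16)            # (batch, t_q, query_size)
    keys = torch.randn(4, 7, 16)             # (batch, t_k, key_size)
    lengths = torch.tensor([7, 5, 3, 2])     # valid key positions per sequence
    attn.set_mask(lengths, keys)
    context, scores = attn(query, keys)
    print(context.shape, scores.shape)       # (4, 2, 16) and (4, 2, 7)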


import itertools

import torch
import torch.nn as nn

import seq2seq.data.config as config
from seq2seq.models.attention import BahdanauAttention
from seq2seq.utils import init_lstm_


class RecurrentAttention(nn.Module):
    """
    LSTM wrapped with an attention module.
    """
    def __init__(self, input_size=1024, context_size=1024, hidden_size=1024,
                 num_layers=1, batch_first=False, dropout=0.2,
                 init_weight=0.1, fusion=True):
        """
        Constructor for the RecurrentAttention.

        :param input_size: number of features in input tensor
        :param context_size: number of features in output from encoder
        :param hidden_size: internal hidden size
        :param num_layers: number of layers in LSTM
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param dropout: probability of dropout (on input to LSTM layer)
        :param init_weight: range for the uniform initializer
        """
        super(RecurrentAttention, self).__init__()

        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, bias=True,
                           batch_first=batch_first)
        init_lstm_(self.rnn, init_weight)

        self.attn = BahdanauAttention(hidden_size, context_size, context_size,
                                      normalize=True, batch_first=batch_first,
                                      fusion=fusion)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, hidden, context, context_len):
        """
        Execute RecurrentAttention.

        :param inputs: tensor with inputs
        :param hidden: hidden state for LSTM layer
        :param context: context tensor from encoder
        :param context_len: vector of encoder sequence lengths

        :returns (rnn_outputs, hidden, attn_output, attn_scores)
        """
        # set attention mask, sequences have different lengths, this mask
        # allows to include only valid elements of context in attention's
        # softmax
        self.attn.set_mask(context_len, context)

        inputs = self.dropout(inputs)
        rnn_outputs, hidden = self.rnn(inputs, hidden)
        attn_outputs, scores = self.attn(rnn_outputs, context)

        return rnn_outputs, hidden, attn_outputs, scores


class Classifier(nn.Module):
    """
    Fully-connected classifier
    """
    def __init__(self, in_features, out_features, init_weight=0.1):
        """
        Constructor for the Classifier.

        :param in_features: number of input features
        :param out_features: number of output features (size of vocabulary)
        :param init_weight: range for the uniform initializer
        """
        super(Classifier, self).__init__()
        self.classifier = nn.Linear(in_features, out_features)
        nn.init.uniform_(self.classifier.weight.data, -init_weight, init_weight)
        nn.init.uniform_(self.classifier.bias.data, -init_weight, init_weight)

    def forward(self, x):
        """
        Execute the classifier.

        :param x: output from decoder
        """
        out = self.classifier(x)
        return out


class ResidualRecurrentDecoder(nn.Module):
    """
    Decoder with Embedding, LSTM layers, attention, residual connections and
    optional dropout.

    Attention implemented in this module is different than the attention
    discussed in the GNMT arxiv paper. In this model the output from the first
    LSTM layer of the decoder goes into the attention module, then the
    re-weighted context is concatenated with inputs to all subsequent LSTM
    layers in the decoder at the current timestep.

    Residual connections are enabled after 3rd LSTM layer, dropout is applied
    on inputs to LSTM layers.
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, embedder=None, init_weight=0.1, fusion=True):
        """
        Constructor of the ResidualRecurrentDecoder.

        :param vocab_size: size of vocabulary
        :param hidden_size: hidden size for LSTM layers
        :param num_layers: number of LSTM layers
        :param dropout: probability of dropout (on input to LSTM layers)
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param embedder: instance of nn.Embedding, if None constructor will
            create new embedding layer
        :param init_weight: range for the uniform initializer
        """
        super(ResidualRecurrentDecoder, self).__init__()

        self.num_layers = num_layers

        self.att_rnn = RecurrentAttention(hidden_size, hidden_size,
                                          hidden_size, num_layers=1,
                                          batch_first=batch_first,
                                          dropout=dropout,
                                          fusion=fusion)

        self.rnn_layers = nn.ModuleList()
        for _ in range(num_layers - 1):
            self.rnn_layers.append(
                nn.LSTM(2 * hidden_size, hidden_size, num_layers=1, bias=True,
                        batch_first=batch_first))

        for lstm in self.rnn_layers:
            init_lstm_(lstm, init_weight)

        self.share_embedding = (embedder is not None)
        if embedder is not None:
            self.embedder = embedder
        else:
            self.embedder = nn.Embedding(vocab_size, hidden_size,
                                         padding_idx=config.PAD)
            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)

        self.classifier = Classifier(hidden_size, vocab_size)
        self.dropout = nn.Dropout(p=dropout)

    def init_hidden(self, hidden):
        """
        Converts flattened hidden state (from sequence generator) into a tuple
        of hidden states.

        :param hidden: None or flattened hidden state for decoder RNN layers
        """
        if hidden is not None:
            # per-layer chunks
            hidden = hidden.chunk(self.num_layers)
            # (h, c) chunks for LSTM layer
            hidden = tuple(i.chunk(2) for i in hidden)
        else:
            hidden = [None] * self.num_layers

        self.next_hidden = []
        return hidden
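
    # Layout note (not part of the original file): with num_layers=4,
    # package_hidden() below concatenates the per-layer (h, c) pairs into a
    # single (2 * num_layers, batch, hidden_size) tensor, and init_hidden()
    # above splits it back into ((h_0, c_0), ..., (h_3, c_3)), each piece of
    # shape (1, batch, hidden_size).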

    def append_hidden(self, h):
        """
        Appends the hidden vector h to the list of internal hidden states.

        :param h: hidden vector
        """
        if self.inference:
            self.next_hidden.append(h)

    def package_hidden(self):
        """
        Flattens the hidden state from all LSTM layers into one tensor (for
        the sequence generator).
        """
        if self.inference:
            hidden = torch.cat(tuple(itertools.chain(*self.next_hidden)))
        else:
            hidden = None
        return hidden

    def forward(self, inputs, context, inference=False):
        """
        Execute the decoder.

        :param inputs: tensor with inputs to the decoder
        :param context: state of encoder, encoder sequence lengths and hidden
            state of decoder's LSTM layers
        :param inference: if True stores and repackages hidden state
        """
        self.inference = inference

        enc_context, enc_len, hidden = context
        hidden = self.init_hidden(hidden)

        if self.share_embedding and self.training:
            x = inputs
        else:
            x = self.embedder(inputs)

        x, h, attn, scores = self.att_rnn(x, hidden[0], enc_context, enc_len)
        self.append_hidden(h)

        x = torch.cat((x, attn), dim=2)
        x = self.dropout(x)
        x, h = self.rnn_layers[0](x, hidden[1])
        self.append_hidden(h)

        for i in range(1, len(self.rnn_layers)):
            residual = x
            x = torch.cat((x, attn), dim=2)
            x = self.dropout(x)
            x, h = self.rnn_layers[i](x, hidden[i + 1])
            self.append_hidden(h)
            x = x + residual

        x = self.classifier(x)
        hidden = self.package_hidden()

        return x, scores, [enc_context, enc_len, hidden]
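

# Usage sketch (not part of the original file; sizes are illustrative).
# fusion=False selects the pure PyTorch attention path; importing
# seq2seq.models.attention still requires its custom extension to be built.
if __name__ == '__main__':
    decoder = ResidualRecurrentDecoder(vocab_size=100, hidden_size=32,
                                       num_layers=4, batch_first=True,
                                       fusion=False)
    enc_context = torch.randn(4, 7, 32)      # encoder outputs (batch, t_k, hidden)
    enc_len = torch.tensor([7, 5, 3, 2])
    tgt = torch.randint(1, 100, (4, 6))      # target token ids (batch, t_q)
    logits, scores, _ = decoder(tgt, (enc_context, enc_len, None))
    print(logits.shape)                      # torch.Size([4, 6, 100])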


import time

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

import seq2seq.data.config as config
from seq2seq.utils import init_lstm_
import seq2seq.pack_utils._C as C


class Revert_varlen(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, offsets):
        ctx.offsets = offsets
        return C.revert_varlen_tensor(input, offsets)

    @staticmethod
    def backward(ctx, grad_output):
        return C.revert_varlen_tensor(grad_output, ctx.offsets), None


revert_varlen = Revert_varlen.apply


class EmuBidirLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True,
                 batch_first=False, bidirectional=True):
        super(EmuBidirLSTM, self).__init__()
        assert num_layers == 1, "emulation bidirectional lstm works for a single layer only"
        assert batch_first == False, "emulation bidirectional lstm works for batch_first = False only"
        assert bidirectional == True, "use for bidirectional lstm only"
        self.bidir = torch.nn.LSTM(input_size, hidden_size, num_layers, bias,
                                   batch_first, bidirectional=True)
        self.layer1 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias,
                                    batch_first)
        self.layer2 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias,
                                    batch_first)
        self.layer1.weight_ih_l0 = self.bidir.weight_ih_l0
        self.layer1.weight_hh_l0 = self.bidir.weight_hh_l0
        self.layer2.weight_ih_l0 = self.bidir.weight_ih_l0_reverse
        self.layer2.weight_hh_l0 = self.bidir.weight_hh_l0_reverse
        self.layer1.bias_ih_l0 = self.bidir.bias_ih_l0
        self.layer1.bias_hh_l0 = self.bidir.bias_hh_l0
        self.layer2.bias_ih_l0 = self.bidir.bias_ih_l0_reverse
        self.layer2.bias_hh_l0 = self.bidir.bias_hh_l0_reverse

    @staticmethod
    def bidir_lstm(model, input, lengths):
        packed_input = pack_padded_sequence(input, lengths)
        out = model(packed_input)[0]
        return pad_packed_sequence(out)[0]

    @staticmethod
    def emu_bidir_lstm(model0, model1, input, lengths):
        mask = C.set_mask_cpp(lengths).unsqueeze(-1).to(input.device,
                                                        input.dtype,
                                                        non_blocking=True)
        offsets = C.get_offsets(input, lengths)
        inputl1 = revert_varlen(input, offsets)
        out1 = model1(inputl1)
        outputs = revert_varlen(out1[0], offsets)
        out0 = model0(input)[0] * mask
        out_bi = torch.cat([out0, outputs], dim=2)
        return out_bi

    def forward(self, input, lengths):
        if (input.size(1) > 512):
            return self.bidir_lstm(self.bidir, input, lengths)
        else:
            return self.emu_bidir_lstm(self.layer2, self.layer1, input, lengths)
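
# Shape note for EmuBidirLSTM above (not part of the original file): for input
# of shape (seq_len, batch, input_size) = (6, 3, 8) with lengths (6, 4, 2),
# forward() returns a (6, 3, 16) tensor, i.e. the two directional outputs
# concatenated along dim 2. Batches with more than 512 sequences take the
# fused cuDNN bidirectional path, smaller batches take the emulated pair of
# unidirectional LSTMs.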


class ResidualRecurrentEncoder(nn.Module):
    """
    Encoder with Embedding, LSTM layers, residual connections and optional
    dropout.

    The first LSTM layer is bidirectional and uses variable sequence length
    API, the remaining (num_layers-1) layers are unidirectional. Residual
    connections are enabled after third LSTM layer, dropout is applied on
    inputs to LSTM layers.
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, embedder=None, init_weight=0.1):
        """
        Constructor for the ResidualRecurrentEncoder.

        :param vocab_size: size of vocabulary
        :param hidden_size: hidden size for LSTM layers
        :param num_layers: number of LSTM layers, 1st layer is bidirectional
        :param dropout: probability of dropout (on input to LSTM layers)
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param embedder: instance of nn.Embedding, if None constructor will
            create new embedding layer
        :param init_weight: range for the uniform initializer
        """
        super(ResidualRecurrentEncoder, self).__init__()
        self.batch_first = batch_first
        self.rnn_layers = nn.ModuleList()

        # 1st LSTM layer, bidirectional
        self.rnn_layers.append(
            EmuBidirLSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                         batch_first=batch_first, bidirectional=True))

        # 2nd LSTM layer, with 2x larger input_size
        self.rnn_layers.append(
            nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True,
                    batch_first=batch_first))

        # Remaining LSTM layers
        for _ in range(num_layers - 2):
            self.rnn_layers.append(
                nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                        batch_first=batch_first))

        init_lstm_(self.rnn_layers[0].bidir)
        for lstm in self.rnn_layers[1:]:
            init_lstm_(lstm)

        self.dropout = nn.Dropout(p=dropout)

        self.share_embedding = (embedder is not None)
        if embedder is not None:
            self.embedder = embedder
        else:
            self.embedder = nn.Embedding(vocab_size, hidden_size,
                                         padding_idx=config.PAD)
            nn.init.uniform_(self.embedder.weight.data, -init_weight,
                             init_weight)

    def forward(self, inputs, lengths):
        """
        Execute the encoder.

        :param inputs: tensor with indices from the vocabulary
        :param lengths: vector with sequence lengths (excluding padding)

        returns: tensor with encoded sequences
        """
        import pdb
        # pdb.set_trace()
        if self.share_embedding and self.training:
            x = inputs
        else:
            x = self.embedder(inputs)

        torch.cuda.synchronize()
        t1 = time.time()
        # bidirectional layer
        x = self.dropout(x)
        x = self.rnn_layers[0](x, lengths)
        torch.cuda.synchronize()
        t2 = time.time()

        # 1st unidirectional layer
        x = self.dropout(x)
        x, _ = self.rnn_layers[1](x)
        torch.cuda.synchronize()
        t3 = time.time()

        # the rest of unidirectional layers,
        # with residual connections starting from 3rd layer
        for i in range(2, len(self.rnn_layers)):
            residual = x
            x = self.dropout(x)
            x, _ = self.rnn_layers[i](x)
            x = x + residual
        torch.cuda.synchronize()
        t4 = time.time()

        print("encode layer_1: ", (t2 - t1) * 1000)
        print("encode layer_2: ", (t3 - t2) * 1000)
        print("encode layer_rest: ", (t4 - t3) * 1000)
        return x
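

# Usage sketch (not part of the original file; sizes are illustrative). This
# profiling variant of forward() calls torch.cuda.synchronize(), so a CUDA
# device and a built seq2seq.pack_utils._C extension are assumed; sequences
# must be sorted by decreasing length.
if __name__ == '__main__':
    encoder = ResidualRecurrentEncoder(vocab_size=100, hidden_size=32,
                                       num_layers=4).cuda()
    tokens = torch.randint(1, 100, (9, 4), device='cuda')   # (seq_len, batch)
    lengths = torch.tensor([9, 7, 4, 2])
    enc_out = encoder(tokens, lengths)
    print(enc_out.shape)                                    # torch.Size([9, 4, 32])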


import torch.nn as nn

import seq2seq.data.config as config
from seq2seq.models.decoder import ResidualRecurrentDecoder
from seq2seq.models.encoder import ResidualRecurrentEncoder
from seq2seq.models.seq2seq_base import Seq2Seq

import torch
import time


class GNMT(Seq2Seq):
    """
    GNMT v2 model
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, share_embedding=True, fusion=True):
        """
        Constructor for the GNMT v2 model.

        :param vocab_size: size of vocabulary (number of tokens)
        :param hidden_size: internal hidden size of the model
        :param num_layers: number of layers, applies to both encoder and
            decoder
        :param dropout: probability of dropout (in encoder and decoder)
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param share_embedding: if True embeddings are shared between encoder
            and decoder
        """
        super(GNMT, self).__init__(batch_first=batch_first)

        if share_embedding:
            embedder = nn.Embedding(vocab_size, hidden_size,
                                    padding_idx=config.PAD)
            nn.init.uniform_(embedder.weight.data, -0.1, 0.1)
        else:
            embedder = None

        self.embedder = embedder
        self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size,
                                                num_layers, dropout,
                                                batch_first, embedder)

        self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size,
                                                num_layers, dropout,
                                                batch_first, embedder,
                                                fusion=fusion)

    # def forward(self, input_encoder, input_enc_len, input_decoder):
    #     if self.embedder:
    #         input_encoder = self.embedder(input_encoder)
    #         input_decoder = self.embedder(input_decoder)
    #     context = self.encode(input_encoder, input_enc_len)
    #     input_enc_len = input_enc_len.to(input_encoder.device,
    #                                      non_blocking=True)
    #     context = (context, input_enc_len, None)
    #     output, _, _ = self.decode(input_decoder, context)
    #     return output

    def forward(self, input_encoder, input_enc_len, input_decoder):
        if self.embedder:
            input_encoder = self.embedder(input_encoder)
            input_decoder = self.embedder(input_decoder)

        # aiss add for prof time
        torch.cuda.synchronize()
        t1 = time.time()
        import pdb
        # pdb.set_trace()
        context = self.encode(input_encoder, input_enc_len)
        torch.cuda.synchronize()
        t2 = time.time()
        time.sleep(120)
        input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
        torch.cuda.synchronize()
        t5 = time.time()
        input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
        torch.cuda.synchronize()
        t6 = time.time()
        context = (context, input_enc_len, None)
        torch.cuda.synchronize()
        t3 = time.time()
        output, _, _ = self.decode(input_decoder, context)
        torch.cuda.synchronize()
        t4 = time.time()
        print("encode time is ", (t2 - t1) * 1000)
        print("decode time is ", (t4 - t3) * 1000)
        print("process time is ", (t3 - t2) * 1000)
        print("process copy time1 is ", (t5 - t2) * 1000)
        print("process copy time2 is ", (t6 - t5) * 1000)
        print("process concat time is ", (t3 - t6) * 1000)
        return output
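

# Construction sketch (not part of the original file; hyper-parameters are
# illustrative). Only construction is shown because this profiling variant of
# forward() requires a CUDA device and sleeps for 120 s; the custom attention
# and pack_utils extensions must be built because they are imported when the
# encoder and decoder modules are loaded.
if __name__ == '__main__':
    model = GNMT(vocab_size=32000, hidden_size=1024, num_layers=4,
                 dropout=0.2, share_embedding=True)
    print(sum(p.numel() for p in model.parameters()))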


import torch.nn as nn
from torch.nn.functional import log_softmax


class Seq2Seq(nn.Module):
    """
    Generic Seq2Seq module, with an encoder and a decoder.
    """
    def __init__(self, encoder=None, decoder=None, batch_first=False):
        """
        Constructor for the Seq2Seq module.

        :param encoder: encoder module
        :param decoder: decoder module
        :param batch_first: if True the model uses (batch, seq, feature)
            tensors, if false the model uses (seq, batch, feature) tensors
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.batch_first = batch_first

    def encode(self, inputs, lengths):
        """
        Applies the encoder to inputs with a given input sequence lengths.

        :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
            else (seq_len, batch)
        :param lengths: vector with sequence lengths (excluding padding)
        """
        return self.encoder(inputs, lengths)

    def decode(self, inputs, context, inference=False):
        """
        Applies the decoder to inputs, given the context from the encoder.

        :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
            else (seq_len, batch)
        :param context: context from the encoder
        :param inference: if True inference mode, if False training mode
        """
        return self.decoder(inputs, context, inference)

    def generate(self, inputs, context, beam_size):
        """
        Autoregressive generator, works with SequenceGenerator class.
        Executes decoder (in inference mode), applies log_softmax and topK for
        inference with beam search decoding.

        :param inputs: tensor with inputs to the decoder
        :param context: context from the encoder
        :param beam_size: beam size for the generator

        returns: (words, logprobs, scores, new_context)
            words: indices of topK tokens
            logprobs: log probabilities of topK tokens
            scores: scores from the attention module (for coverage penalty)
            new_context: new decoder context, includes new hidden states for
                decoder RNN cells
        """
        logits, scores, new_context = self.decode(inputs, context, True)
        logprobs = log_softmax(logits, dim=-1)
        logprobs, words = logprobs.topk(beam_size, dim=-1)
        return words, logprobs, scores, new_context
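

# Driver sketch (not part of the original file; illustrative pseudocode). A
# SequenceGenerator typically drives generate() step by step during beam
# search, roughly as follows; select_next_tokens is a hypothetical helper:
#
#     enc_out = model.encode(src, src_len)
#     context = (enc_out, src_len, None)
#     tokens = bos_tokens                                  # (batch * beam, 1)
#     for _ in range(max_len):
#         words, logprobs, scores, context = model.generate(tokens, context,
#                                                           beam_size)
#         tokens = select_next_tokens(words, logprobs)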