Commit 27dab946 authored by huchen

Merge branch 'GNMT-v2' into 'main'

Updated GNMT v2

See merge request dcutoolkit/deeplearing/dlexamples_new!11
parents 20291e9d 07c30a15
-import logging
+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import math
 import torch
@@ -6,25 +26,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
-import seq2seq.attn_score._C as C
-
-
-class AttentionScore(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, att_query, att_keys, bias, linear_att):
-        score = C.forward(att_query, att_keys, bias, linear_att)
-        ctx.save_for_backward(att_query, att_keys, bias, linear_att)
-        return score
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        att_query, att_keys, bias, linear_att = ctx.saved_tensors
-        grad_query, grad_keys, grad_bias, grad_linear_att = \
-            C.backward(grad_output, att_query, att_keys,
-                       bias, linear_att)
-        return grad_query, grad_keys, grad_bias, grad_linear_att
-
-
-fused_calc_score = AttentionScore.apply
 class BahdanauAttention(nn.Module):
     """
@@ -32,7 +33,7 @@ class BahdanauAttention(nn.Module):
     Implementation is very similar to tf.contrib.seq2seq.BahdanauAttention
     """
     def __init__(self, query_size, key_size, num_units, normalize=False,
-                 batch_first=False, init_weight=0.1, fusion=True):
+                 batch_first=False, init_weight=0.1):
         """
         Constructor for the BahdanauAttention.
@@ -68,8 +69,6 @@ class BahdanauAttention(nn.Module):
             self.register_parameter('normalize_bias', None)
         self.reset_parameters(init_weight)
-        self.fusion = fusion
-        logging.info(f'Fused attention flag set to {fusion}')
     def reset_parameters(self, init_weight):
         """
@@ -161,18 +160,12 @@ class BahdanauAttention(nn.Module):
         processed_key = self.linear_k(keys)
         # scores: (b x t_q x t_k)
-        if self.fusion:
-            linear_att = self.linear_att / self.linear_att.norm()
-            linear_att = linear_att * self.normalize_scalar
-            scores = fused_calc_score(processed_query, processed_key,
-                                      self.normalize_bias, linear_att)
-        else:
-            scores = self.calc_score(processed_query, processed_key)
+        scores = self.calc_score(processed_query, processed_key)
         if self.mask is not None:
             mask = self.mask.unsqueeze(1).expand(b, t_q, t_k)
             # I can't use -INF because of overflow check in pytorch
-            scores.data.masked_fill_(mask, -65504.0)
+            scores.masked_fill_(mask, -65504.0)
         # Normalize the scores, softmax over t_k
         scores_normalized = F.softmax(scores, dim=-1)
...

+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import itertools
 import torch
@@ -14,7 +35,7 @@ class RecurrentAttention(nn.Module):
     """
     def __init__(self, input_size=1024, context_size=1024, hidden_size=1024,
                  num_layers=1, batch_first=False, dropout=0.2,
-                 init_weight=0.1, fusion=True):
+                 init_weight=0.1):
         """
         Constructor for the RecurrentAttention.
@@ -35,8 +56,7 @@ class RecurrentAttention(nn.Module):
         init_lstm_(self.rnn, init_weight)
         self.attn = BahdanauAttention(hidden_size, context_size, context_size,
-                                      normalize=True, batch_first=batch_first,
-                                      fusion=fusion)
+                                      normalize=True, batch_first=batch_first)
         self.dropout = nn.Dropout(dropout)
@@ -105,7 +125,7 @@ class ResidualRecurrentDecoder(nn.Module):
     on inputs to LSTM layers.
     """
     def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
-                 batch_first=False, embedder=None, init_weight=0.1, fusion=True):
+                 batch_first=False, embedder=None, init_weight=0.1):
         """
         Constructor of the ResidualRecurrentDecoder.
@@ -126,8 +146,7 @@ class ResidualRecurrentDecoder(nn.Module):
         self.att_rnn = RecurrentAttention(hidden_size, hidden_size,
                                           hidden_size, num_layers=1,
                                           batch_first=batch_first,
-                                          dropout=dropout,
-                                          fusion=fusion)
+                                          dropout=dropout)
         self.rnn_layers = nn.ModuleList()
         for _ in range(num_layers - 1):
@@ -138,13 +157,13 @@ class ResidualRecurrentDecoder(nn.Module):
         for lstm in self.rnn_layers:
             init_lstm_(lstm, init_weight)
-        self.share_embedding = (embedder is not None)
         if embedder is not None:
             self.embedder = embedder
         else:
             self.embedder = nn.Embedding(vocab_size, hidden_size,
                                          padding_idx=config.PAD)
-            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)
+            nn.init.uniform_(self.embedder.weight.data, -init_weight,
+                             init_weight)
         self.classifier = Classifier(hidden_size, vocab_size)
         self.dropout = nn.Dropout(p=dropout)
@@ -201,10 +220,7 @@ class ResidualRecurrentDecoder(nn.Module):
         enc_context, enc_len, hidden = context
         hidden = self.init_hidden(hidden)
-        if self.share_embedding and self.training:
-            x = inputs
-        else:
-            x = self.embedder(inputs)
+        x = self.embedder(inputs)
         x, h, attn, scores = self.att_rnn(x, hidden[0], enc_context, enc_len)
         self.append_hidden(h)
...

-import torch
+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import torch.nn as nn
 from torch.nn.utils.rnn import pack_padded_sequence
 from torch.nn.utils.rnn import pad_packed_sequence
 import seq2seq.data.config as config
 from seq2seq.utils import init_lstm_
-import seq2seq.pack_utils._C as C
-
-
-class Revert_varlen(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, input, offsets):
-        ctx.offsets = offsets
-        return C.revert_varlen_tensor(input, offsets)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return C.revert_varlen_tensor(grad_output, ctx.offsets), None
-
-
-revert_varlen = Revert_varlen.apply
-
-
-class EmuBidirLSTM(nn.Module):
-    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, bidirectional=True):
-        super(EmuBidirLSTM, self).__init__()
-        assert num_layers == 1, "emulation bidirectional lstm works for a single layer only"
-        assert batch_first == False, "emulation bidirectional lstm works for batch_first = False only"
-        assert bidirectional == True, "use for bidirectional lstm only"
-        self.bidir = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first, bidirectional=True)
-        self.layer1 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first)
-        self.layer2 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first)
-        self.layer1.weight_ih_l0 = self.bidir.weight_ih_l0
-        self.layer1.weight_hh_l0 = self.bidir.weight_hh_l0
-        self.layer2.weight_ih_l0 = self.bidir.weight_ih_l0_reverse
-        self.layer2.weight_hh_l0 = self.bidir.weight_hh_l0_reverse
-        self.layer1.bias_ih_l0 = self.bidir.bias_ih_l0
-        self.layer1.bias_hh_l0 = self.bidir.bias_hh_l0
-        self.layer2.bias_ih_l0 = self.bidir.bias_ih_l0_reverse
-        self.layer2.bias_hh_l0 = self.bidir.bias_hh_l0_reverse
-
-    @staticmethod
-    def bidir_lstm(model, input, lengths):
-        packed_input = pack_padded_sequence(input, lengths)
-        out = model(packed_input)[0]
-        return pad_packed_sequence(out)[0]
-
-    @staticmethod
-    def emu_bidir_lstm(model0, model1, input, lengths):
-        mask = C.set_mask_cpp(lengths).unsqueeze(-1).to(input.device,
-                                                        input.dtype, non_blocking=True)
-        offsets = C.get_offsets(input, lengths)
-        inputl1 = revert_varlen(input, offsets)
-        out1 = model1(inputl1)
-        outputs = revert_varlen(out1[0], offsets)
-        out0 = model0(input)[0] * mask
-        out_bi = torch.cat([out0, outputs], dim=2)
-        return out_bi
-
-    def forward(self, input, lengths):
-        if (input.size(1) > 512):
-            return self.bidir_lstm(self.bidir, input, lengths)
-        else:
-            return self.emu_bidir_lstm(self.layer2, self.layer1, input, lengths)
 class ResidualRecurrentEncoder(nn.Module):
@@ -93,8 +57,8 @@ class ResidualRecurrentEncoder(nn.Module):
         self.rnn_layers = nn.ModuleList()
         # 1st LSTM layer, bidirectional
         self.rnn_layers.append(
-            EmuBidirLSTM(hidden_size, hidden_size, num_layers=1, bias=True,
+            nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                     batch_first=batch_first, bidirectional=True))
         # 2nd LSTM layer, with 2x larger input_size
         self.rnn_layers.append(
@@ -107,19 +71,18 @@ class ResidualRecurrentEncoder(nn.Module):
             nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                     batch_first=batch_first))
-        init_lstm_(self.rnn_layers[0].bidir)
-        for lstm in self.rnn_layers[1:]:
-            init_lstm_(lstm)
+        for lstm in self.rnn_layers:
+            init_lstm_(lstm, init_weight)
         self.dropout = nn.Dropout(p=dropout)
-        self.share_embedding = (embedder is not None)
         if embedder is not None:
             self.embedder = embedder
         else:
             self.embedder = nn.Embedding(vocab_size, hidden_size,
                                          padding_idx=config.PAD)
-            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)
+            nn.init.uniform_(self.embedder.weight.data, -init_weight,
+                             init_weight)
     def forward(self, inputs, lengths):
         """
@@ -130,14 +93,14 @@ class ResidualRecurrentEncoder(nn.Module):
         returns: tensor with encoded sequences
         """
-        if self.share_embedding and self.training:
-            x = inputs
-        else:
-            x = self.embedder(inputs)
+        x = self.embedder(inputs)
         # bidirectional layer
         x = self.dropout(x)
-        x = self.rnn_layers[0](x, lengths)
+        x = pack_padded_sequence(x, lengths.cpu().numpy(),
+                                 batch_first=self.batch_first)
+        x, _ = self.rnn_layers[0](x)
+        x, _ = pad_packed_sequence(x, batch_first=self.batch_first)
         # 1st unidirectional layer
         x = self.dropout(x)
...

+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import torch.nn as nn
 import seq2seq.data.config as config
 from seq2seq.models.decoder import ResidualRecurrentDecoder
 from seq2seq.models.encoder import ResidualRecurrentEncoder
 from seq2seq.models.seq2seq_base import Seq2Seq
-import torch
-import time
 class GNMT(Seq2Seq):
     """
     GNMT v2 model
     """
     def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
-                 batch_first=False, share_embedding=True, fusion=True):
+                 batch_first=False, share_embedding=True):
         """
         Constructor for the GNMT v2 model.
@@ -36,55 +56,17 @@ class GNMT(Seq2Seq):
         else:
             embedder = None
-        self.embedder = embedder
         self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size,
                                                 num_layers, dropout,
                                                 batch_first, embedder)
         self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size,
                                                 num_layers, dropout,
-                                                batch_first, embedder,
-                                                fusion=fusion)
+                                                batch_first, embedder)
     def forward(self, input_encoder, input_enc_len, input_decoder):
-        if self.embedder:
-            input_encoder = self.embedder(input_encoder)
-            input_decoder = self.embedder(input_decoder)
         context = self.encode(input_encoder, input_enc_len)
-        input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
         context = (context, input_enc_len, None)
         output, _, _ = self.decode(input_decoder, context)
         return output
-    # def forward(self, input_encoder, input_enc_len, input_decoder):
-    #     if self.embedder:
-    #         input_encoder = self.embedder(input_encoder)
-    #         input_decoder = self.embedder(input_decoder)
-    ###aiss add for prof time
-    #     torch.cuda.synchronize()
-    #     t1 = time.time()
-    #     context = self.encode(input_encoder, input_enc_len)
-    #     torch.cuda.synchronize()
-    #     t2 = time.time()
-    #     input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
-    #     torch.cuda.synchronize()
-    #     t5 = time.time()
-    #     input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
-    #     torch.cuda.synchronize()
-    #     t6 = time.time()
-    #     context = (context, input_enc_len, None)
-    #     torch.cuda.synchronize()
-    #     t3 = time.time()
-    #     output, _, _ = self.decode(input_decoder, context)
-    #     torch.cuda.synchronize()
-    #     t4 = time.time()
-    #     print("encode time is ", (t2 - t1) * 1000)
-    #     print("decode time is ", (t4 - t3) * 1000)
-    #     print("process time is ", (t3 - t2) * 1000)
-    #     print("process copy time1 is ", (t5 - t2) * 1000)
-    #     print("process copy time2 is ", (t6 - t5) * 1000)
-    #     print("process concat time is ", (t3 - t6) * 1000)
-    #
-    #     return output
-    #

+# Copyright (c) 2017 Elad Hoffer
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 import torch.nn as nn
 from torch.nn.functional import log_softmax
...

import logging
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

import seq2seq.attn_score._C as C


class AttentionScore(torch.autograd.Function):
    @staticmethod
    def forward(ctx, att_query, att_keys, bias, linear_att):
        score = C.forward(att_query, att_keys, bias, linear_att)
        ctx.save_for_backward(att_query, att_keys, bias, linear_att)
        return score

    @staticmethod
    def backward(ctx, grad_output):
        att_query, att_keys, bias, linear_att = ctx.saved_tensors
        grad_query, grad_keys, grad_bias, grad_linear_att = \
            C.backward(grad_output, att_query, att_keys,
                       bias, linear_att)
        return grad_query, grad_keys, grad_bias, grad_linear_att


fused_calc_score = AttentionScore.apply
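

# Reference sketch (not part of the original file): the fused kernel above is
# used as a drop-in replacement for the pure PyTorch score computation in
# BahdanauAttention below, so a freshly built seq2seq.attn_score._C extension
# can be sanity-checked against this expression.
def _reference_attention_score(att_query, att_keys, bias, linear_att):
    # att_query: (b, t_q, n), att_keys: (b, t_k, n), bias and linear_att: (n,)
    sum_qk = att_query.unsqueeze(2) + att_keys.unsqueeze(1) + bias
    return torch.tanh(sum_qk).matmul(linear_att)  # (b, t_q, t_k)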


class BahdanauAttention(nn.Module):
    """
    Bahdanau Attention (https://arxiv.org/abs/1409.0473)
    Implementation is very similar to tf.contrib.seq2seq.BahdanauAttention
    """
    def __init__(self, query_size, key_size, num_units, normalize=False,
                 batch_first=False, init_weight=0.1, fusion=True):
        """
        Constructor for the BahdanauAttention.

        :param query_size: feature dimension for query
        :param key_size: feature dimension for keys
        :param num_units: internal feature dimension
        :param normalize: whether to normalize energy term
        :param batch_first: if True batch size is the 1st dimension, if False
            the sequence is first and batch size is second
        :param init_weight: range for uniform initializer used to initialize
            Linear key and query transform layers and linear_att vector
        """
        super(BahdanauAttention, self).__init__()

        self.normalize = normalize
        self.batch_first = batch_first
        self.num_units = num_units

        self.linear_q = nn.Linear(query_size, num_units, bias=False)
        self.linear_k = nn.Linear(key_size, num_units, bias=False)
        nn.init.uniform_(self.linear_q.weight.data, -init_weight, init_weight)
        nn.init.uniform_(self.linear_k.weight.data, -init_weight, init_weight)

        self.linear_att = Parameter(torch.Tensor(num_units))

        self.mask = None

        if self.normalize:
            self.normalize_scalar = Parameter(torch.Tensor(1))
            self.normalize_bias = Parameter(torch.Tensor(num_units))
        else:
            self.register_parameter('normalize_scalar', None)
            self.register_parameter('normalize_bias', None)

        self.reset_parameters(init_weight)

        self.fusion = fusion
        logging.info(f'Fused attention flag set to {fusion}')

    def reset_parameters(self, init_weight):
        """
        Sets initial random values for trainable parameters.
        """
        stdv = 1. / math.sqrt(self.num_units)
        self.linear_att.data.uniform_(-init_weight, init_weight)

        if self.normalize:
            self.normalize_scalar.data.fill_(stdv)
            self.normalize_bias.data.zero_()

    def set_mask(self, context_len, context):
        """
        sets self.mask which is applied before softmax
        ones for inactive context fields, zeros for active context fields

        :param context_len: b
        :param context: if batch_first: (b x t_k x n) else: (t_k x b x n)

        self.mask: (b x t_k)
        """
        if self.batch_first:
            max_len = context.size(1)
        else:
            max_len = context.size(0)

        indices = torch.arange(0, max_len, dtype=torch.int64,
                               device=context.device)
        self.mask = indices >= (context_len.unsqueeze(1))
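        # Example (illustrative): with context_len = tensor([3, 1]) and
        # max_len = 4 the resulting mask is
        #     [[False, False, False, True],
        #      [False, True,  True,  True]]
        # True marks padded key positions, which are excluded from the softmax
        # in forward() below.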

    def calc_score(self, att_query, att_keys):
        """
        Calculate Bahdanau score

        :param att_query: b x t_q x n
        :param att_keys: b x t_k x n

        returns: b x t_q x t_k scores
        """
        b, t_k, n = att_keys.size()
        t_q = att_query.size(1)

        att_query = att_query.unsqueeze(2).expand(b, t_q, t_k, n)
        att_keys = att_keys.unsqueeze(1).expand(b, t_q, t_k, n)
        sum_qk = att_query + att_keys

        if self.normalize:
            sum_qk = sum_qk + self.normalize_bias
            linear_att = self.linear_att / self.linear_att.norm()
            linear_att = linear_att * self.normalize_scalar
        else:
            linear_att = self.linear_att

        out = torch.tanh(sum_qk).matmul(linear_att)
        return out

    def forward(self, query, keys):
        """
        :param query: if batch_first: (b x t_q x n) else: (t_q x b x n)
        :param keys: if batch_first: (b x t_k x n) else (t_k x b x n)

        :returns: (context, scores_normalized)
        context: if batch_first: (b x t_q x n) else (t_q x b x n)
        scores_normalized: if batch_first (b x t_q x t_k) else (t_q x b x t_k)
        """
        # first dim of keys and query has to be 'batch', it's needed for bmm
        if not self.batch_first:
            keys = keys.transpose(0, 1)
            if query.dim() == 3:
                query = query.transpose(0, 1)

        if query.dim() == 2:
            single_query = True
            query = query.unsqueeze(1)
        else:
            single_query = False

        b = query.size(0)
        t_k = keys.size(1)
        t_q = query.size(1)

        # FC layers to transform query and key
        processed_query = self.linear_q(query)
        processed_key = self.linear_k(keys)

        # scores: (b x t_q x t_k)
        if self.fusion:
            linear_att = self.linear_att / self.linear_att.norm()
            linear_att = linear_att * self.normalize_scalar
            scores = fused_calc_score(processed_query, processed_key,
                                      self.normalize_bias, linear_att)
        else:
            scores = self.calc_score(processed_query, processed_key)

        if self.mask is not None:
            mask = self.mask.unsqueeze(1).expand(b, t_q, t_k)
            # I can't use -INF because of overflow check in pytorch
            scores.data.masked_fill_(mask, -65504.0)

        # Normalize the scores, softmax over t_k
        scores_normalized = F.softmax(scores, dim=-1)

        # Calculate the weighted average of the attention inputs according to
        # the scores
        # context: (b x t_q x n)
        context = torch.bmm(scores_normalized, keys)

        if single_query:
            context = context.squeeze(1)
            scores_normalized = scores_normalized.squeeze(1)
        elif not self.batch_first:
            context = context.transpose(0, 1)
            scores_normalized = scores_normalized.transpose(0, 1)

        return context, scores_normalized
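

# Usage sketch (not part of the original file; shapes are illustrative).
# fusion=False keeps the pure PyTorch calc_score path; importing this module
# still requires the seq2seq.attn_score._C extension to be built.
if __name__ == '__main__':
    attn = BahdanauAttention(query_size=16, key_size=16, num_units=16,
                             normalize=True, batch_first=True, fusion=False)
    query = torch.randn(4, 2, 16)            # (batch, t_q, query_size)
    keys = torch.randn(4, 7, 16)             # (batch, t_k, key_size)
    lengths = torch.tensor([7, 5, 3, 2])     # valid key positions per sequence
    attn.set_mask(lengths, keys)
    context, scores = attn(query, keys)
    print(context.shape, scores.shape)       # (4, 2, 16) and (4, 2, 7)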


import itertools

import torch
import torch.nn as nn

import seq2seq.data.config as config
from seq2seq.models.attention import BahdanauAttention
from seq2seq.utils import init_lstm_


class RecurrentAttention(nn.Module):
    """
    LSTM wrapped with an attention module.
    """
    def __init__(self, input_size=1024, context_size=1024, hidden_size=1024,
                 num_layers=1, batch_first=False, dropout=0.2,
                 init_weight=0.1, fusion=True):
        """
        Constructor for the RecurrentAttention.

        :param input_size: number of features in input tensor
        :param context_size: number of features in output from encoder
        :param hidden_size: internal hidden size
        :param num_layers: number of layers in LSTM
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param dropout: probability of dropout (on input to LSTM layer)
        :param init_weight: range for the uniform initializer
        """
        super(RecurrentAttention, self).__init__()

        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, bias=True,
                           batch_first=batch_first)
        init_lstm_(self.rnn, init_weight)

        self.attn = BahdanauAttention(hidden_size, context_size, context_size,
                                      normalize=True, batch_first=batch_first,
                                      fusion=fusion)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, hidden, context, context_len):
        """
        Execute RecurrentAttention.

        :param inputs: tensor with inputs
        :param hidden: hidden state for LSTM layer
        :param context: context tensor from encoder
        :param context_len: vector of encoder sequence lengths

        :returns (rnn_outputs, hidden, attn_output, attn_scores)
        """
        # set attention mask, sequences have different lengths, this mask
        # allows to include only valid elements of context in attention's
        # softmax
        self.attn.set_mask(context_len, context)

        inputs = self.dropout(inputs)
        rnn_outputs, hidden = self.rnn(inputs, hidden)
        attn_outputs, scores = self.attn(rnn_outputs, context)

        return rnn_outputs, hidden, attn_outputs, scores


class Classifier(nn.Module):
    """
    Fully-connected classifier
    """
    def __init__(self, in_features, out_features, init_weight=0.1):
        """
        Constructor for the Classifier.

        :param in_features: number of input features
        :param out_features: number of output features (size of vocabulary)
        :param init_weight: range for the uniform initializer
        """
        super(Classifier, self).__init__()
        self.classifier = nn.Linear(in_features, out_features)
        nn.init.uniform_(self.classifier.weight.data, -init_weight, init_weight)
        nn.init.uniform_(self.classifier.bias.data, -init_weight, init_weight)

    def forward(self, x):
        """
        Execute the classifier.

        :param x: output from decoder
        """
        out = self.classifier(x)
        return out


class ResidualRecurrentDecoder(nn.Module):
    """
    Decoder with Embedding, LSTM layers, attention, residual connections and
    optional dropout.

    Attention implemented in this module is different than the attention
    discussed in the GNMT arxiv paper. In this model the output from the first
    LSTM layer of the decoder goes into the attention module, then the
    re-weighted context is concatenated with inputs to all subsequent LSTM
    layers in the decoder at the current timestep.

    Residual connections are enabled after 3rd LSTM layer, dropout is applied
    on inputs to LSTM layers.
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, embedder=None, init_weight=0.1, fusion=True):
        """
        Constructor of the ResidualRecurrentDecoder.

        :param vocab_size: size of vocabulary
        :param hidden_size: hidden size for LSTM layers
        :param num_layers: number of LSTM layers
        :param dropout: probability of dropout (on input to LSTM layers)
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param embedder: instance of nn.Embedding, if None constructor will
            create new embedding layer
        :param init_weight: range for the uniform initializer
        """
        super(ResidualRecurrentDecoder, self).__init__()

        self.num_layers = num_layers

        self.att_rnn = RecurrentAttention(hidden_size, hidden_size,
                                          hidden_size, num_layers=1,
                                          batch_first=batch_first,
                                          dropout=dropout,
                                          fusion=fusion)

        self.rnn_layers = nn.ModuleList()
        for _ in range(num_layers - 1):
            self.rnn_layers.append(
                nn.LSTM(2 * hidden_size, hidden_size, num_layers=1, bias=True,
                        batch_first=batch_first))

        for lstm in self.rnn_layers:
            init_lstm_(lstm, init_weight)

        self.share_embedding = (embedder is not None)
        if embedder is not None:
            self.embedder = embedder
        else:
            self.embedder = nn.Embedding(vocab_size, hidden_size,
                                         padding_idx=config.PAD)
            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)

        self.classifier = Classifier(hidden_size, vocab_size)
        self.dropout = nn.Dropout(p=dropout)

    def init_hidden(self, hidden):
        """
        Converts flattened hidden state (from sequence generator) into a tuple
        of hidden states.

        :param hidden: None or flattened hidden state for decoder RNN layers
        """
        if hidden is not None:
            # per-layer chunks
            hidden = hidden.chunk(self.num_layers)
            # (h, c) chunks for LSTM layer
            hidden = tuple(i.chunk(2) for i in hidden)
        else:
            hidden = [None] * self.num_layers

        self.next_hidden = []
        return hidden
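
    # Layout note (not part of the original file): with num_layers=4,
    # package_hidden() below concatenates the per-layer (h, c) pairs into a
    # single (2 * num_layers, batch, hidden_size) tensor, and init_hidden()
    # above splits it back into ((h_0, c_0), ..., (h_3, c_3)), each piece of
    # shape (1, batch, hidden_size).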

    def append_hidden(self, h):
        """
        Appends the hidden vector h to the list of internal hidden states.

        :param h: hidden vector
        """
        if self.inference:
            self.next_hidden.append(h)

    def package_hidden(self):
        """
        Flattens the hidden state from all LSTM layers into one tensor (for
        the sequence generator).
        """
        if self.inference:
            hidden = torch.cat(tuple(itertools.chain(*self.next_hidden)))
        else:
            hidden = None
        return hidden

    def forward(self, inputs, context, inference=False):
        """
        Execute the decoder.

        :param inputs: tensor with inputs to the decoder
        :param context: state of encoder, encoder sequence lengths and hidden
            state of decoder's LSTM layers
        :param inference: if True stores and repackages hidden state
        """
        self.inference = inference

        enc_context, enc_len, hidden = context
        hidden = self.init_hidden(hidden)

        if self.share_embedding and self.training:
            x = inputs
        else:
            x = self.embedder(inputs)

        x, h, attn, scores = self.att_rnn(x, hidden[0], enc_context, enc_len)
        self.append_hidden(h)

        x = torch.cat((x, attn), dim=2)
        x = self.dropout(x)
        x, h = self.rnn_layers[0](x, hidden[1])
        self.append_hidden(h)

        for i in range(1, len(self.rnn_layers)):
            residual = x
            x = torch.cat((x, attn), dim=2)
            x = self.dropout(x)
            x, h = self.rnn_layers[i](x, hidden[i + 1])
            self.append_hidden(h)
            x = x + residual

        x = self.classifier(x)
        hidden = self.package_hidden()

        return x, scores, [enc_context, enc_len, hidden]
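

# Usage sketch (not part of the original file; sizes are illustrative).
# fusion=False selects the pure PyTorch attention path; importing
# seq2seq.models.attention still requires its custom extension to be built.
if __name__ == '__main__':
    decoder = ResidualRecurrentDecoder(vocab_size=100, hidden_size=32,
                                       num_layers=4, batch_first=True,
                                       fusion=False)
    enc_context = torch.randn(4, 7, 32)      # encoder outputs (batch, t_k, hidden)
    enc_len = torch.tensor([7, 5, 3, 2])
    tgt = torch.randint(1, 100, (4, 6))      # target token ids (batch, t_q)
    logits, scores, _ = decoder(tgt, (enc_context, enc_len, None))
    print(logits.shape)                      # torch.Size([4, 6, 100])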


import time

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

import seq2seq.data.config as config
from seq2seq.utils import init_lstm_
import seq2seq.pack_utils._C as C


class Revert_varlen(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, offsets):
        ctx.offsets = offsets
        return C.revert_varlen_tensor(input, offsets)

    @staticmethod
    def backward(ctx, grad_output):
        return C.revert_varlen_tensor(grad_output, ctx.offsets), None


revert_varlen = Revert_varlen.apply


class EmuBidirLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True,
                 batch_first=False, bidirectional=True):
        super(EmuBidirLSTM, self).__init__()
        assert num_layers == 1, "emulation bidirectional lstm works for a single layer only"
        assert batch_first == False, "emulation bidirectional lstm works for batch_first = False only"
        assert bidirectional == True, "use for bidirectional lstm only"
        self.bidir = torch.nn.LSTM(input_size, hidden_size, num_layers, bias,
                                   batch_first, bidirectional=True)
        self.layer1 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias,
                                    batch_first)
        self.layer2 = torch.nn.LSTM(input_size, hidden_size, num_layers, bias,
                                    batch_first)
        self.layer1.weight_ih_l0 = self.bidir.weight_ih_l0
        self.layer1.weight_hh_l0 = self.bidir.weight_hh_l0
        self.layer2.weight_ih_l0 = self.bidir.weight_ih_l0_reverse
        self.layer2.weight_hh_l0 = self.bidir.weight_hh_l0_reverse
        self.layer1.bias_ih_l0 = self.bidir.bias_ih_l0
        self.layer1.bias_hh_l0 = self.bidir.bias_hh_l0
        self.layer2.bias_ih_l0 = self.bidir.bias_ih_l0_reverse
        self.layer2.bias_hh_l0 = self.bidir.bias_hh_l0_reverse

    @staticmethod
    def bidir_lstm(model, input, lengths):
        packed_input = pack_padded_sequence(input, lengths)
        out = model(packed_input)[0]
        return pad_packed_sequence(out)[0]

    @staticmethod
    def emu_bidir_lstm(model0, model1, input, lengths):
        mask = C.set_mask_cpp(lengths).unsqueeze(-1).to(input.device,
                                                        input.dtype,
                                                        non_blocking=True)
        offsets = C.get_offsets(input, lengths)
        inputl1 = revert_varlen(input, offsets)
        out1 = model1(inputl1)
        outputs = revert_varlen(out1[0], offsets)
        out0 = model0(input)[0] * mask
        out_bi = torch.cat([out0, outputs], dim=2)
        return out_bi

    def forward(self, input, lengths):
        if (input.size(1) > 512):
            return self.bidir_lstm(self.bidir, input, lengths)
        else:
            return self.emu_bidir_lstm(self.layer2, self.layer1, input, lengths)
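
# Shape note for EmuBidirLSTM above (not part of the original file): for input
# of shape (seq_len, batch, input_size) = (6, 3, 8) with lengths (6, 4, 2),
# forward() returns a (6, 3, 16) tensor, i.e. the two directional outputs
# concatenated along dim 2. Batches with more than 512 sequences take the
# fused cuDNN bidirectional path, smaller batches take the emulated pair of
# unidirectional LSTMs.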


class ResidualRecurrentEncoder(nn.Module):
    """
    Encoder with Embedding, LSTM layers, residual connections and optional
    dropout.

    The first LSTM layer is bidirectional and uses variable sequence length
    API, the remaining (num_layers-1) layers are unidirectional. Residual
    connections are enabled after third LSTM layer, dropout is applied on
    inputs to LSTM layers.
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, embedder=None, init_weight=0.1):
        """
        Constructor for the ResidualRecurrentEncoder.

        :param vocab_size: size of vocabulary
        :param hidden_size: hidden size for LSTM layers
        :param num_layers: number of LSTM layers, 1st layer is bidirectional
        :param dropout: probability of dropout (on input to LSTM layers)
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param embedder: instance of nn.Embedding, if None constructor will
            create new embedding layer
        :param init_weight: range for the uniform initializer
        """
        super(ResidualRecurrentEncoder, self).__init__()
        self.batch_first = batch_first
        self.rnn_layers = nn.ModuleList()

        # 1st LSTM layer, bidirectional
        self.rnn_layers.append(
            EmuBidirLSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                         batch_first=batch_first, bidirectional=True))

        # 2nd LSTM layer, with 2x larger input_size
        self.rnn_layers.append(
            nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True,
                    batch_first=batch_first))

        # Remaining LSTM layers
        for _ in range(num_layers - 2):
            self.rnn_layers.append(
                nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                        batch_first=batch_first))

        init_lstm_(self.rnn_layers[0].bidir)
        for lstm in self.rnn_layers[1:]:
            init_lstm_(lstm)

        self.dropout = nn.Dropout(p=dropout)

        self.share_embedding = (embedder is not None)
        if embedder is not None:
            self.embedder = embedder
        else:
            self.embedder = nn.Embedding(vocab_size, hidden_size,
                                         padding_idx=config.PAD)
            nn.init.uniform_(self.embedder.weight.data, -init_weight,
                             init_weight)

    def forward(self, inputs, lengths):
        """
        Execute the encoder.

        :param inputs: tensor with indices from the vocabulary
        :param lengths: vector with sequence lengths (excluding padding)

        returns: tensor with encoded sequences
        """
        import pdb
        # pdb.set_trace()
        if self.share_embedding and self.training:
            x = inputs
        else:
            x = self.embedder(inputs)

        torch.cuda.synchronize()
        t1 = time.time()
        # bidirectional layer
        x = self.dropout(x)
        x = self.rnn_layers[0](x, lengths)
        torch.cuda.synchronize()
        t2 = time.time()

        # 1st unidirectional layer
        x = self.dropout(x)
        x, _ = self.rnn_layers[1](x)
        torch.cuda.synchronize()
        t3 = time.time()

        # the rest of unidirectional layers,
        # with residual connections starting from 3rd layer
        for i in range(2, len(self.rnn_layers)):
            residual = x
            x = self.dropout(x)
            x, _ = self.rnn_layers[i](x)
            x = x + residual
        torch.cuda.synchronize()
        t4 = time.time()

        print("encode layer_1: ", (t2 - t1) * 1000)
        print("encode layer_2: ", (t3 - t2) * 1000)
        print("encode layer_rest: ", (t4 - t3) * 1000)
        return x
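

# Usage sketch (not part of the original file; sizes are illustrative). This
# profiling variant of forward() calls torch.cuda.synchronize(), so a CUDA
# device and a built seq2seq.pack_utils._C extension are assumed; sequences
# must be sorted by decreasing length.
if __name__ == '__main__':
    encoder = ResidualRecurrentEncoder(vocab_size=100, hidden_size=32,
                                       num_layers=4).cuda()
    tokens = torch.randint(1, 100, (9, 4), device='cuda')   # (seq_len, batch)
    lengths = torch.tensor([9, 7, 4, 2])
    enc_out = encoder(tokens, lengths)
    print(enc_out.shape)                                    # torch.Size([9, 4, 32])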


import torch.nn as nn

import seq2seq.data.config as config
from seq2seq.models.decoder import ResidualRecurrentDecoder
from seq2seq.models.encoder import ResidualRecurrentEncoder
from seq2seq.models.seq2seq_base import Seq2Seq

import torch
import time


class GNMT(Seq2Seq):
    """
    GNMT v2 model
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, share_embedding=True, fusion=True):
        """
        Constructor for the GNMT v2 model.

        :param vocab_size: size of vocabulary (number of tokens)
        :param hidden_size: internal hidden size of the model
        :param num_layers: number of layers, applies to both encoder and
            decoder
        :param dropout: probability of dropout (in encoder and decoder)
        :param batch_first: if True the model uses (batch,seq,feature) tensors,
            if false the model uses (seq, batch, feature)
        :param share_embedding: if True embeddings are shared between encoder
            and decoder
        """
        super(GNMT, self).__init__(batch_first=batch_first)

        if share_embedding:
            embedder = nn.Embedding(vocab_size, hidden_size,
                                    padding_idx=config.PAD)
            nn.init.uniform_(embedder.weight.data, -0.1, 0.1)
        else:
            embedder = None

        self.embedder = embedder
        self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size,
                                                num_layers, dropout,
                                                batch_first, embedder)

        self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size,
                                                num_layers, dropout,
                                                batch_first, embedder,
                                                fusion=fusion)

    # def forward(self, input_encoder, input_enc_len, input_decoder):
    #     if self.embedder:
    #         input_encoder = self.embedder(input_encoder)
    #         input_decoder = self.embedder(input_decoder)
    #     context = self.encode(input_encoder, input_enc_len)
    #     input_enc_len = input_enc_len.to(input_encoder.device,
    #                                      non_blocking=True)
    #     context = (context, input_enc_len, None)
    #     output, _, _ = self.decode(input_decoder, context)
    #     return output

    def forward(self, input_encoder, input_enc_len, input_decoder):
        if self.embedder:
            input_encoder = self.embedder(input_encoder)
            input_decoder = self.embedder(input_decoder)

        # aiss add for prof time
        torch.cuda.synchronize()
        t1 = time.time()
        import pdb
        # pdb.set_trace()
        context = self.encode(input_encoder, input_enc_len)
        torch.cuda.synchronize()
        t2 = time.time()
        time.sleep(120)
        input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
        torch.cuda.synchronize()
        t5 = time.time()
        input_enc_len = input_enc_len.to(input_encoder.device, non_blocking=True)
        torch.cuda.synchronize()
        t6 = time.time()
        context = (context, input_enc_len, None)
        torch.cuda.synchronize()
        t3 = time.time()
        output, _, _ = self.decode(input_decoder, context)
        torch.cuda.synchronize()
        t4 = time.time()
        print("encode time is ", (t2 - t1) * 1000)
        print("decode time is ", (t4 - t3) * 1000)
        print("process time is ", (t3 - t2) * 1000)
        print("process copy time1 is ", (t5 - t2) * 1000)
        print("process copy time2 is ", (t6 - t5) * 1000)
        print("process concat time is ", (t3 - t6) * 1000)
        return output
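

# Construction sketch (not part of the original file; hyper-parameters are
# illustrative). Only construction is shown because this profiling variant of
# forward() requires a CUDA device and sleeps for 120 s; the custom attention
# and pack_utils extensions must be built because they are imported when the
# encoder and decoder modules are loaded.
if __name__ == '__main__':
    model = GNMT(vocab_size=32000, hidden_size=1024, num_layers=4,
                 dropout=0.2, share_embedding=True)
    print(sum(p.numel() for p in model.parameters()))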


import torch.nn as nn
from torch.nn.functional import log_softmax


class Seq2Seq(nn.Module):
    """
    Generic Seq2Seq module, with an encoder and a decoder.
    """
    def __init__(self, encoder=None, decoder=None, batch_first=False):
        """
        Constructor for the Seq2Seq module.

        :param encoder: encoder module
        :param decoder: decoder module
        :param batch_first: if True the model uses (batch, seq, feature)
            tensors, if false the model uses (seq, batch, feature) tensors
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.batch_first = batch_first

    def encode(self, inputs, lengths):
        """
        Applies the encoder to inputs with a given input sequence lengths.

        :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
            else (seq_len, batch)
        :param lengths: vector with sequence lengths (excluding padding)
        """
        return self.encoder(inputs, lengths)

    def decode(self, inputs, context, inference=False):
        """
        Applies the decoder to inputs, given the context from the encoder.

        :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
            else (seq_len, batch)
        :param context: context from the encoder
        :param inference: if True inference mode, if False training mode
        """
        return self.decoder(inputs, context, inference)

    def generate(self, inputs, context, beam_size):
        """
        Autoregressive generator, works with SequenceGenerator class.
        Executes decoder (in inference mode), applies log_softmax and topK for
        inference with beam search decoding.

        :param inputs: tensor with inputs to the decoder
        :param context: context from the encoder
        :param beam_size: beam size for the generator

        returns: (words, logprobs, scores, new_context)
            words: indices of topK tokens
            logprobs: log probabilities of topK tokens
            scores: scores from the attention module (for coverage penalty)
            new_context: new decoder context, includes new hidden states for
                decoder RNN cells
        """
        logits, scores, new_context = self.decode(inputs, context, True)
        logprobs = log_softmax(logits, dim=-1)
        logprobs, words = logprobs.topk(beam_size, dim=-1)
        return words, logprobs, scores, new_context
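

# Driver sketch (not part of the original file; illustrative pseudocode). A
# SequenceGenerator typically drives generate() step by step during beam
# search, roughly as follows; select_next_tokens is a hypothetical helper:
#
#     enc_out = model.encode(src, src_len)
#     context = (enc_out, src_len, None)
#     tokens = bos_tokens                                  # (batch * beam, 1)
#     for _ in range(max_len):
#         words, logprobs, scores, context = model.generate(tokens, context,
#                                                           beam_size)
#         tokens = select_next_tokens(words, logprobs)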