# encoder.py
import time

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

import seq2seq.data.config as config
import seq2seq.pack_utils._C as C
from seq2seq.utils import init_lstm_


class Revert_varlen(torch.autograd.Function):
    """Autograd wrapper around C.revert_varlen_tensor.

    Reverses each variable-length sequence in a padded batch according to the
    precomputed offsets. The reversal is its own inverse, so the backward pass
    simply reverts the incoming gradient with the same offsets.
    """
    @staticmethod
    def forward(ctx, input, offsets):
        ctx.offsets = offsets
        return C.revert_varlen_tensor(input, offsets)

    @staticmethod
    def backward(ctx, grad_output):
        # No gradient is returned for offsets.
        return C.revert_varlen_tensor(grad_output, ctx.offsets), None


revert_varlen = Revert_varlen.apply
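

# Illustrative pure-PyTorch sketch of what the fused C.revert_varlen_tensor
# kernel is assumed to compute: reversing each sequence of a padded
# (seq_len, batch, feature) tensor along time, up to that sequence's own
# length. The helper name `reverse_padded_sequences` is hypothetical and is
# not part of seq2seq.pack_utils; it exists only to document the semantics.
def reverse_padded_sequences(padded, lengths):
    reversed_out = padded.clone()
    for b, length in enumerate(lengths):
        length = int(length)
        # Flip only the valid (non-padded) timesteps of sample b; padding
        # positions keep their original (cloned) values.
        reversed_out[:length, b] = padded[:length, b].flip(0)
    return reversed_out
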

class EmuBidirLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True,
                 batch_first=False, bidirectional=True):
        super(EmuBidirLSTM, self).__init__()
        assert num_layers == 1, "emulation bidirectional lstm works for a single layer only"
        assert not batch_first, "emulation bidirectional lstm works for batch_first = False only"
        assert bidirectional, "use for bidirectional lstm only"
        # Native cudnn bidirectional LSTM, used for large batches.
        self.bidir = nn.LSTM(input_size, hidden_size, num_layers, bias,
                             batch_first, bidirectional=True)
        # Two unidirectional LSTMs that emulate the bidirectional one for
        # small batches; they share parameters with self.bidir so that both
        # execution paths stay numerically consistent.
        self.layer1 = nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first)
        self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first)
        self.layer1.weight_ih_l0 = self.bidir.weight_ih_l0
        self.layer1.weight_hh_l0 = self.bidir.weight_hh_l0
        self.layer2.weight_ih_l0 = self.bidir.weight_ih_l0_reverse
        self.layer2.weight_hh_l0 = self.bidir.weight_hh_l0_reverse
        self.layer1.bias_ih_l0 = self.bidir.bias_ih_l0
        self.layer1.bias_hh_l0 = self.bidir.bias_hh_l0
        self.layer2.bias_ih_l0 = self.bidir.bias_ih_l0_reverse
        self.layer2.bias_hh_l0 = self.bidir.bias_hh_l0_reverse

    @staticmethod
    def bidir_lstm(model, input, lengths):
        # Run the native bidirectional LSTM on a packed sequence and unpack
        # the result back to a padded (seq_len, batch, 2 * hidden) tensor.
        packed_input = pack_padded_sequence(input, lengths)
        out = model(packed_input)[0]
        return pad_packed_sequence(out)[0]

    @staticmethod
    def emu_bidir_lstm(model0, model1, input, lengths):
        # Mask that zeroes out positions beyond each sequence's length.
        mask = C.set_mask_cpp(lengths).unsqueeze(-1).to(input.device,
            input.dtype, non_blocking=True)
        offsets = C.get_offsets(input, lengths)
        # One direction: reverse each sequence, run model1, then reverse the
        # outputs back into the original time order.
        inputl1 = revert_varlen(input, offsets)
        out1 = model1(inputl1)
        outputs = revert_varlen(out1[0], offsets)
        # Other direction: run model0 on the original padded input and mask
        # out contributions from padding.
        out0 = model0(input)[0] * mask
        # Concatenate the two directions along the feature dimension.
        out_bi = torch.cat([out0, outputs], dim=2)
        return out_bi

    def forward(self, input, lengths):
        # Large batches use the native cudnn bidirectional kernel; smaller
        # batches fall back to the two-LSTM emulation path.
        if input.size(1) > 512:
            return self.bidir_lstm(self.bidir, input, lengths)
        else:
            return self.emu_bidir_lstm(self.layer2, self.layer1, input, lengths)
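

# Hypothetical usage sketch for EmuBidirLSTM (not called anywhere in this
# module). It assumes a CUDA device and the seq2seq.pack_utils extension are
# available; the input layout is (seq_len, batch, input_size) and lengths must
# be sorted in decreasing order, as required by pack_padded_sequence.
def _emu_bidir_lstm_example():
    lstm = EmuBidirLSTM(8, 8).cuda()
    x = torch.randn(5, 3, 8, device="cuda")    # (seq, batch, feature)
    lengths = torch.tensor([5, 4, 2])          # decreasing sequence lengths
    out = lstm(x, lengths)                     # (5, 3, 16): both directions concatenated
    return out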


class ResidualRecurrentEncoder(nn.Module):
    """
    Encoder with Embedding, LSTM layers, residual connections and optional
    dropout.

    The first LSTM layer is bidirectional and uses the variable sequence
    length API; the remaining (num_layers - 1) layers are unidirectional.
    Residual connections are enabled starting from the third LSTM layer, and
    dropout is applied to the inputs of the LSTM layers.
    """
    def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
                 batch_first=False, embedder=None, init_weight=0.1):
        """
        Constructor for the ResidualRecurrentEncoder.

        :param vocab_size: size of vocabulary
        :param hidden_size: hidden size for LSTM layers
        :param num_layers: number of LSTM layers, 1st layer is bidirectional
        :param dropout: probability of dropout (on input to LSTM layers)
        :param batch_first: if True the model uses (batch, seq, feature)
            tensors, if False the model uses (seq, batch, feature)
        :param embedder: instance of nn.Embedding, if None constructor will
            create new embedding layer
        :param init_weight: range for the uniform initializer
        """
        super(ResidualRecurrentEncoder, self).__init__()
        self.batch_first = batch_first
        self.rnn_layers = nn.ModuleList()
        # 1st LSTM layer, bidirectional
        self.rnn_layers.append(
            EmuBidirLSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                         batch_first=batch_first, bidirectional=True))

        # 2nd LSTM layer, with 2x larger input_size
        self.rnn_layers.append(
            nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True,
                    batch_first=batch_first))

        # Remaining LSTM layers
        for _ in range(num_layers - 2):
            self.rnn_layers.append(
                nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
                        batch_first=batch_first))

        init_lstm_(self.rnn_layers[0].bidir)
        for lstm in self.rnn_layers[1:]:
            init_lstm_(lstm)

        self.dropout = nn.Dropout(p=dropout)

        self.share_embedding = (embedder is not None)
        if embedder is not None:
            self.embedder = embedder
        else:
            self.embedder = nn.Embedding(vocab_size, hidden_size,
                                         padding_idx=config.PAD)
            nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)

    def forward(self, inputs, lengths):
        """
        Execute the encoder.

        :param inputs: tensor with indices from the vocabulary
        :param lengths: vector with sequence lengths (excluding padding)

        :returns: tensor with encoded sequences
        """
        if self.share_embedding and self.training:
            x = inputs
        else:
            x = self.embedder(inputs)

        # Per-stage timing: synchronize so each timestamp reflects completed
        # GPU work rather than queued kernel launches.
        torch.cuda.synchronize()
        t1 = time.time()
        # bidirectional layer
        x = self.dropout(x)
        x = self.rnn_layers[0](x, lengths)

        torch.cuda.synchronize()
        t2 = time.time()
        # 1st unidirectional layer
        x = self.dropout(x)
        x, _ = self.rnn_layers[1](x)

        torch.cuda.synchronize()
        t3 = time.time()
        # the rest of unidirectional layers,
        # with residual connections starting from 3rd layer
        for i in range(2, len(self.rnn_layers)):
            residual = x
            x = self.dropout(x)
            x, _ = self.rnn_layers[i](x)
            x = x + residual

        torch.cuda.synchronize()
        t4 = time.time()
        print("encode layer_1: ",(t2-t1)*1000)
        print("encode layer_2: ",(t3-t2)*1000)
        print("encode layer_rest: ",(t4-t3)*1000)

        return x
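

# Hypothetical smoke test (not part of the original file): builds a small
# encoder and runs a single forward pass. The vocabulary size, shapes and the
# CUDA requirement are illustrative assumptions; the timing code inside
# forward() expects a CUDA device to be present.
if __name__ == "__main__":
    vocab_size = 32000
    encoder = ResidualRecurrentEncoder(vocab_size).cuda()
    seq_len, batch = 20, 16
    tokens = torch.randint(0, vocab_size, (seq_len, batch), device="cuda")
    # Lengths sorted in decreasing order, as required by pack_padded_sequence.
    lengths = torch.tensor(sorted([20, 18, 15, 12] * 4, reverse=True))
    out = encoder(tokens, lengths)
    print("encoder output shape:", tuple(out.shape))  # (seq_len, batch, hidden_size)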