from typing import List from typing import Optional from typing import Sequence from typing import Tuple from typing import Union import logging import torch import torch.nn as nn from torch.utils.checkpoint import checkpoint def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None): """Make mask tensor containing indices of padded part. Args: lengths (LongTensor or List): Batch of lengths (B,). xs (Tensor, optional): The reference tensor. If set, masks will be the same shape as this tensor. length_dim (int, optional): Dimension indicator of the above tensor. See the example. Returns: Tensor: Mask tensor containing indices of padded part. dtype=torch.uint8 in PyTorch 1.2- dtype=torch.bool in PyTorch 1.2+ (including 1.2) Examples: With only lengths. >>> lengths = [5, 3, 2] >>> make_pad_mask(lengths) masks = [[0, 0, 0, 0 ,0], [0, 0, 0, 1, 1], [0, 0, 1, 1, 1]] With the reference tensor. >>> xs = torch.zeros((3, 2, 4)) >>> make_pad_mask(lengths, xs) tensor([[[0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 1], [0, 0, 0, 1]], [[0, 0, 1, 1], [0, 0, 1, 1]]], dtype=torch.uint8) >>> xs = torch.zeros((3, 2, 6)) >>> make_pad_mask(lengths, xs) tensor([[[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]], [[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]], [[0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8) With the reference tensor and dimension indicator. >>> xs = torch.zeros((3, 6, 6)) >>> make_pad_mask(lengths, xs, 1) tensor([[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]], [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]], dtype=torch.uint8) >>> make_pad_mask(lengths, xs, 2) tensor([[[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]], [[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]], [[0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8) """ if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) if not isinstance(lengths, list): lengths = lengths.tolist() bs = int(len(lengths)) if maxlen is None: if xs is None: maxlen = int(max(lengths)) else: maxlen = xs.size(length_dim) else: assert xs is None assert maxlen >= int(max(lengths)) seq_range = torch.arange(0, maxlen, dtype=torch.int64) seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen) seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1) mask = seq_range_expand >= seq_length_expand if xs is not None: assert xs.size(0) == bs, (xs.size(0), bs) if length_dim < 0: length_dim = xs.dim() + length_dim # ind = (:, None, ..., None, :, , None, ..., None) ind = tuple(slice(None) if i in (0, length_dim) else None for i in range(xs.dim())) mask = mask[ind].expand_as(xs).to(xs.device) return mask class SinusoidalPositionEncoder(torch.nn.Module): """ """ def __int__(self, d_model=80, dropout_rate=0.1): pass def encode( self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32 ): batch_size = positions.size(0) positions = positions.type(dtype) device = positions.device log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / ( depth / 2 - 1 ) inv_timescales = torch.exp( torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment) ) inv_timescales = torch.reshape(inv_timescales, [batch_size, -1]) scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape( inv_timescales, [1, 1, -1] ) encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2) return encoding.type(dtype) def forward(self, x): batch_size, timesteps, input_dim = x.size() positions = torch.arange(1, timesteps + 1, device=x.device)[None, :] position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device) return x + position_encoding class PositionwiseFeedForward(torch.nn.Module): """Positionwise feed forward layer. Args: idim (int): Input dimenstion. hidden_units (int): The number of hidden units. dropout_rate (float): Dropout rate. """ def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()): """Construct an PositionwiseFeedForward object.""" super(PositionwiseFeedForward, self).__init__() self.w_1 = torch.nn.Linear(idim, hidden_units) self.w_2 = torch.nn.Linear(hidden_units, idim) self.dropout = torch.nn.Dropout(dropout_rate) self.activation = activation def forward(self, x): """Forward function.""" return self.w_2(self.dropout(self.activation(self.w_1(x)))) class MultiHeadedAttentionSANM(nn.Module): """Multi-Head Attention layer. Args: n_head (int): The number of heads. n_feat (int): The number of features. dropout_rate (float): Dropout rate. """ def __init__( self, n_head, in_feat, n_feat, dropout_rate, kernel_size, sanm_shfit=0, lora_list=None, lora_rank=8, lora_alpha=16, lora_dropout=0.1, ): """Construct an MultiHeadedAttention object.""" super().__init__() assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head self.linear_out = nn.Linear(n_feat, n_feat) self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3) self.attn = None self.dropout = nn.Dropout(p=dropout_rate) self.fsmn_block = nn.Conv1d( n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False ) # padding left_padding = (kernel_size - 1) // 2 if sanm_shfit > 0: left_padding = left_padding + sanm_shfit right_padding = kernel_size - 1 - left_padding self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None): b, t, d = inputs.size() if mask is not None: mask = torch.reshape(mask, (b, -1, 1)) if mask_shfit_chunk is not None: mask = mask * mask_shfit_chunk inputs = inputs * mask x = inputs.transpose(1, 2) x = self.pad_fn(x) x = self.fsmn_block(x) x = x.transpose(1, 2) x += inputs x = self.dropout(x) if mask is not None: x = x * mask return x def forward_qkv(self, x): """Transform query, key and value. Args: query (torch.Tensor): Query tensor (#batch, time1, size). key (torch.Tensor): Key tensor (#batch, time2, size). value (torch.Tensor): Value tensor (#batch, time2, size). Returns: torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). """ b, t, d = x.size() q_k_v = self.linear_q_k_v(x) q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose( 1, 2 ) # (batch, head, time1, d_k) k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose( 1, 2 ) # (batch, head, time2, d_k) v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose( 1, 2 ) # (batch, head, time2, d_k) return q_h, k_h, v_h, v def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None): """Compute attention context vector. Args: value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). Returns: torch.Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ n_batch = value.size(0) if mask is not None: if mask_att_chunk_encoder is not None: mask = mask * mask_att_chunk_encoder mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) min_value = -float( "inf" ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) scores = scores.masked_fill(mask, min_value) self.attn = torch.softmax(scores, dim=-1).masked_fill( mask, 0.0 ) # (batch, head, time1, time2) else: self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) p_attn = self.dropout(self.attn) x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) x = ( x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) ) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model) def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): """Compute scaled dot product attention. Args: query (torch.Tensor): Query tensor (#batch, time1, size). key (torch.Tensor): Key tensor (#batch, time2, size). value (torch.Tensor): Value tensor (#batch, time2, size). mask (torch.Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). Returns: torch.Tensor: Output tensor (#batch, time1, d_model). """ q_h, k_h, v_h, v = self.forward_qkv(x) fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) q_h = q_h * self.d_k ** (-0.5) scores = torch.matmul(q_h, k_h.transpose(-2, -1)) att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) return att_outs + fsmn_memory def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): """Compute scaled dot product attention. Args: query (torch.Tensor): Query tensor (#batch, time1, size). key (torch.Tensor): Key tensor (#batch, time2, size). value (torch.Tensor): Value tensor (#batch, time2, size). mask (torch.Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). Returns: torch.Tensor: Output tensor (#batch, time1, d_model). """ q_h, k_h, v_h, v = self.forward_qkv(x) if chunk_size is not None and look_back > 0 or look_back == -1: if cache is not None: k_h_stride = k_h[:, :, : -(chunk_size[2]), :] v_h_stride = v_h[:, :, : -(chunk_size[2]), :] k_h = torch.cat((cache["k"], k_h), dim=2) v_h = torch.cat((cache["v"], v_h), dim=2) cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2) cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2) if look_back != -1: cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]) :, :] cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]) :, :] else: cache_tmp = { "k": k_h[:, :, : -(chunk_size[2]), :], "v": v_h[:, :, : -(chunk_size[2]), :], } cache = cache_tmp fsmn_memory = self.forward_fsmn(v, None) q_h = q_h * self.d_k ** (-0.5) scores = torch.matmul(q_h, k_h.transpose(-2, -1)) att_outs = self.forward_attention(v_h, scores, None) return att_outs + fsmn_memory, cache class MultiSequential(torch.nn.Sequential): """Multi-input multi-output torch.nn.Sequential.""" def __init__(self, *args, layer_drop_rate=0.0): """Initialize MultiSequential with layer_drop. Args: layer_drop_rate (float): Probability of dropping out each fn (layer). """ super(MultiSequential, self).__init__(*args) self.layer_drop_rate = layer_drop_rate def forward(self, *args): """Repeat.""" _probs = torch.empty(len(self)).uniform_() for idx, m in enumerate(self): if not self.training or (_probs[idx] >= self.layer_drop_rate): args = m(*args) return args class CheckpointFunction(torch.nn.Module): """Wrap a module function for gradient checkpointing.""" def __init__(self, module_fn): super().__init__() self.module_fn = module_fn def forward(self, *args, **kwargs): # Use checkpointing on the forward pass of the module function return checkpoint(self.module_fn, *args, **kwargs) def repeat(N, fn, layer_drop_rate=0.0, use_checkpoint=False): """Repeat module N times. Args: N (int): Number of repeat time. fn (Callable): Function to generate module. layer_drop_rate (float): Probability of dropping out each fn (layer). Returns: MultiSequential: Repeated model instance. """ modules = [] for n in range(N): module_fn = fn(n) if use_checkpoint: # Wrap the module function with checkpointing module_fn = CheckpointFunction(module_fn) modules.append(module_fn) return MultiSequential(*modules, layer_drop_rate=layer_drop_rate) class LayerNorm(torch.nn.LayerNorm): """Layer normalization module. Args: nout (int): Output dim size. dim (int): Dimension to be normalized. """ def __init__(self, nout, dim=-1): """Construct an LayerNorm object.""" super(LayerNorm, self).__init__(nout, eps=1e-12) self.dim = dim def forward(self, x): """Apply layer normalization. Args: x (torch.Tensor): Input tensor. Returns: torch.Tensor: Normalized tensor. """ if self.dim == -1: return super(LayerNorm, self).forward(x) return super(LayerNorm, self).forward(x.transpose(self.dim, -1)).transpose(self.dim, -1) class EncoderLayerSANM(nn.Module): def __init__( self, in_size, size, self_attn, feed_forward, dropout_rate, normalize_before=True, concat_after=False, stochastic_depth_rate=0.0, ): """Construct an EncoderLayer object.""" super(EncoderLayerSANM, self).__init__() self.self_attn = self_attn self.feed_forward = feed_forward self.norm1 = LayerNorm(in_size) self.norm2 = LayerNorm(size) self.dropout = nn.Dropout(dropout_rate) self.in_size = in_size self.size = size self.normalize_before = normalize_before self.concat_after = concat_after if self.concat_after: self.concat_linear = nn.Linear(size + size, size) self.stochastic_depth_rate = stochastic_depth_rate self.dropout_rate = dropout_rate def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): """Compute encoded features. Args: x_input (torch.Tensor): Input tensor (#batch, time, size). mask (torch.Tensor): Mask tensor for the input (#batch, time). cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). Returns: torch.Tensor: Output tensor (#batch, time, size). torch.Tensor: Mask tensor (#batch, time). """ skip_layer = False # with stochastic depth, residual connection `x + f(x)` becomes # `x <- x + 1 / (1 - p) * f(x)` at training time. stoch_layer_coeff = 1.0 if self.training and self.stochastic_depth_rate > 0: skip_layer = torch.rand(1).item() < self.stochastic_depth_rate stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) if skip_layer: if cache is not None: x = torch.cat([cache, x], dim=1) return x, mask residual = x if self.normalize_before: x = self.norm1(x) if self.concat_after: x_concat = torch.cat( ( x, self.self_attn( x, mask, mask_shfit_chunk=mask_shfit_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ), ), dim=-1, ) if self.in_size == self.size: x = residual + stoch_layer_coeff * self.concat_linear(x_concat) else: x = stoch_layer_coeff * self.concat_linear(x_concat) else: if self.in_size == self.size: x = residual + stoch_layer_coeff * self.dropout( self.self_attn( x, mask, mask_shfit_chunk=mask_shfit_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) else: x = stoch_layer_coeff * self.dropout( self.self_attn( x, mask, mask_shfit_chunk=mask_shfit_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) if not self.normalize_before: x = self.norm1(x) residual = x if self.normalize_before: x = self.norm2(x) x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) if not self.normalize_before: x = self.norm2(x) return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder class SANMEncoder(nn.Module): def __init__( self, input_size: int = 560, output_size: int = 512, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 50, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.1, input_layer: Optional[str] = "pe", pos_enc_class=SinusoidalPositionEncoder, normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 1, padding_idx: int = -1, interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, kernel_size: int = 11, sanm_shfit: int = 0, lora_list: List[str] = None, lora_rank: int = 8, lora_alpha: int = 16, lora_dropout: float = 0.1, selfattention_layer_type: str = "sanm", tf2torch_tensor_name_prefix_torch: str = "encoder", tf2torch_tensor_name_prefix_tf: str = "seq2seq/encoder", gradient_checkpoint=False ): super().__init__() self._output_size = output_size if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "pe": self.embed = SinusoidalPositionEncoder() else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") if selfattention_layer_type == "sanm": encoder_selfattn_layer = MultiHeadedAttentionSANM encoder_selfattn_layer_args0 = ( attention_heads, input_size, output_size, attention_dropout_rate, kernel_size, sanm_shfit, lora_list, lora_rank, lora_alpha, lora_dropout, ) encoder_selfattn_layer_args = ( attention_heads, output_size, output_size, attention_dropout_rate, kernel_size, sanm_shfit, lora_list, lora_rank, lora_alpha, lora_dropout, ) self.encoders0 = repeat( 1, lambda lnum: EncoderLayerSANM( input_size, output_size, encoder_selfattn_layer(*encoder_selfattn_layer_args0), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), use_checkpoint=gradient_checkpoint ) self.encoders = repeat( num_blocks - 1, lambda lnum: EncoderLayerSANM( output_size, output_size, encoder_selfattn_layer(*encoder_selfattn_layer_args), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), use_checkpoint=gradient_checkpoint ) if self.normalize_before: self.after_norm = LayerNorm(output_size) self.interctc_layer_idx = interctc_layer_idx if len(interctc_layer_idx) > 0: assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks self.interctc_use_conditioning = interctc_use_conditioning self.conditioning_layer = None self.dropout = nn.Dropout(dropout_rate) self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf def output_size(self) -> int: return self._output_size def forward( self, xs_pad: torch.Tensor, ilens: torch.Tensor, prev_states: torch.Tensor = None, ctc = None, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Embed positions in tensor. Args: xs_pad: input tensor (B, L, D) ilens: input length (B) prev_states: Not to be used now. Returns: position embedded tensor and mask """ masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) xs_pad = xs_pad * self.output_size() ** 0.5 xs_pad = self.embed(xs_pad) # xs_pad = self.dropout(xs_pad) encoder_outs = self.encoders0(xs_pad, masks) xs_pad, masks = encoder_outs[0], encoder_outs[1] intermediate_outs = [] if len(self.interctc_layer_idx) == 0: encoder_outs = self.encoders(xs_pad, masks) xs_pad, masks = encoder_outs[0], encoder_outs[1] else: for layer_idx, encoder_layer in enumerate(self.encoders): encoder_outs = encoder_layer(xs_pad, masks) xs_pad, masks = encoder_outs[0], encoder_outs[1] if layer_idx + 1 in self.interctc_layer_idx: encoder_out = xs_pad # intermediate outputs are also normalized if self.normalize_before: encoder_out = self.after_norm(encoder_out) intermediate_outs.append((layer_idx + 1, encoder_out)) if self.interctc_use_conditioning: ctc_out = ctc.softmax(encoder_out) xs_pad = xs_pad + self.conditioning_layer(ctc_out) if self.normalize_before: xs_pad = self.after_norm(xs_pad) olens = masks.squeeze(1).sum(1) if len(intermediate_outs) > 0: return (xs_pad, intermediate_outs), olens, None return xs_pad, olens, None