# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow as flow
from oneflow import nn

from libai.config import configurable
from libai.layers import (
    Embedding,
    LayerNorm,
    Linear,
    LMLogits,
    ParallelCrossEntropyLoss,
    TransformerLayer,
    VocabEmbedding,
    build_activation,
)
from libai.utils import distributed as dist

from .bert_model import BertEmbeddings, BertExtendedAttnMask, BertModel, BertPooler
from .utils import init_method_normal


class RobertaExtendedAttnMask(BertExtendedAttnMask):
    """Same as BertExtendedAttnMask."""


class RobertaEmbeddings(BertEmbeddings):
    """Same as BertEmbeddings, with a tiny tweak for vocab_embeddings and position_embeddings."""

    def __init__(
        self,
        vocab_size,
        hidden_size,
        max_sequence_length,
        embedding_dropout_prob,
        num_tokentypes=0,
        pad_token_id=1,
        init_method=nn.init.xavier_normal_,
        amp_enabled=False,
    ):
        super().__init__(
            vocab_size,
            hidden_size,
            max_sequence_length,
            embedding_dropout_prob,
            num_tokentypes=num_tokentypes,
            init_method=init_method,
            amp_enabled=amp_enabled,
        )
        self.pad_token_id = pad_token_id
        self.vocab_embeddings = VocabEmbedding(
            vocab_size,
            hidden_size,
            init_method=init_method,
            amp_enabled=amp_enabled,
            padding_idx=pad_token_id,
        )
        self.position_embeddings = Embedding(
            max_sequence_length,
            hidden_size,
            init_method=init_method,
            amp_enabled=amp_enabled,
            padding_idx=pad_token_id,
        )
        if num_tokentypes > 0:
            self.tokentype_embeddings = Embedding(
                num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
            )
            self.tokentype_ids = flow.zeros(
                1,
                max_sequence_length,
                dtype=flow.long,
                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
                placement=dist.get_layer_placement(0),
            )
        else:
            self.tokentype_embeddings = None

    def forward(self, input_ids, tokentype_ids=None, position_ids=None):
        seq_length = input_ids.size()[1]

        word_embeddings = self.vocab_embeddings(input_ids)
        if position_ids is None:
            position_ids = self.create_position_ids_from_input_ids(input_ids, self.pad_token_id)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = word_embeddings + position_embeddings

        if self.tokentype_embeddings is not None:
            if tokentype_ids is None:
                tokentype_ids = (
                    self.tokentype_ids[:, :seq_length]
                    .expand_as(input_ids)
                    .to_global(sbp=input_ids.sbp)
                )
            embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)

        embeddings = self.embedding_dropout(embeddings)
        return embeddings

    def create_position_ids_from_input_ids(self, input_ids, pad_token_id):
        mask = input_ids.ne(pad_token_id).int()
        position_ids = (flow.cumsum(mask, dim=1).type_as(mask)) * mask + pad_token_id
        position_ids = position_ids.to_global(sbp=input_ids.sbp, placement=input_ids.placement)
        return position_ids


class RobertaPooler(BertPooler):
    """Same as BertPooler."""
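

# Worked example (illustrative comment only, not executed by the model) of how
# RobertaEmbeddings.create_position_ids_from_input_ids above numbers tokens.
# The concrete values below are assumptions for illustration, with pad_token_id=1
# and a single local row of input_ids:
#
#   input_ids                 = [[ 0, 5, 8, 1, 1 ]]
#   mask = ne(pad_id).int()   = [[ 1, 1, 1, 0, 0 ]]
#   cumsum(mask, dim=1)*mask  = [[ 1, 2, 3, 0, 0 ]]
#   + pad_token_id            = [[ 2, 3, 4, 1, 1 ]]
#
# Real tokens are numbered consecutively starting at pad_token_id + 1, while every
# padding position maps back to pad_token_id, which is also the padding_idx passed
# to the vocab and position embeddings above.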
""" class RobertaLoss(nn.Module): def __init__(self): super().__init__() self.lm_loss = ParallelCrossEntropyLoss() def forward(self, lm_output, lm_labels, loss_mask): lm_labels = lm_labels.to_global(placement=lm_output.placement) loss_mask = loss_mask.to_global(placement=lm_output.placement) lm_loss = self.lm_loss(lm_output, lm_labels) loss_mask = loss_mask.float() # Change loss_mask.sum() sbp sign from [P, B] -> [B, B] # because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P denominator = loss_mask.sum().to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]) ) masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator masked_lm_loss = masked_lm_loss.to_global( sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast]) ) loss_dict = {"lm_loss": masked_lm_loss} return loss_dict class RobertaModel(BertModel): """The bare Roberta Model transformer outputting raw hidden-states without any specific head on top. Args: vocab_size (int): The size of vocabulary file. hidden_size (int): The size of hidden states. hidden_layers (int): The number of ``TransformerLayer`` in encoder. num_attention_heads (int): The number of attention heads for each attention layer of ``TransformerLayer``. intermediate_size (int): The size of intermediate layer in feed-forward network for each ``TransformerLayer``. hidden_dropout_prob (float, optional): The dropout ratio for the output for each TransformerLayer. Defaults to 0.0. attention_probs_dropout_prob (float, optional): The dropout ratio for the output of each attention layer in ``TransformerLayer``. Defaults to 0.0. max_position_embeddings (int): Max sequence length of input, defines the shape of Position Embeddings in ``RobertaEmbeddings``. type_vocab_size (int, optional): Number of segment token indices. Defaults to 2. add_pooling_layer (bool, optional): Whether or not averaging or pooling the sequence of hidden-states for the whole input sequence. Defaults to ``True``. initializer_range (float, optional): Sigma of the normal distribution in the initialization method. Defaults to 0.02. layer_norm_eps (float, optional): The epsilon of LayerNorm layer. Defaults to 1e-5. pad_token_id (int, optional): The token id used for padding. Defaults to 1. bias_gelu_fusion (bool, optional): Whether or not to fuse the computing of bias and gelu. Defaults to ``False``. bias_dropout_fusion (bool, optional): Whether or not to fuse the computing of dropout and bias. Defaults to ``False``. scale_mask_softmax_fusion (bool, optional): Whether to fuse the computing of mask and softmax in attention layers. Defaults to ``False``. apply_query_key_layer_scaling (bool, optional): Whether or not to use layer index related scaling in computing attention scores. If ``True``, the scaling factor equals to sqrt(d) * (layer_index + 1). Defaults to ``True``. apply_residual_post_layernorm (bool, optional): If set ``True``, use original BERT(Roberta) residual connection ordering otherwise use Megatron BERT residual connection which is more stable when scaling model size introduced in https://arxiv.org/pdf/1909.08053.pdf. Default: ``False``. amp_enabled (bool, optional): Whether or not to set fp16 for embedding weight in T5 model. Defaults to ``False``. 
""" @configurable def __init__( self, vocab_size, hidden_size, hidden_layers, num_attention_heads, intermediate_size, hidden_dropout_prob, attention_probs_dropout_prob, max_position_embeddings, num_tokentypes=2, add_pooling_layer=True, initializer_range=0.02, layernorm_eps=1e-12, pad_token_id=1, bias_gelu_fusion=True, bias_dropout_fusion=True, scale_mask_softmax_fusion=True, apply_query_key_layer_scaling=True, apply_residual_post_layernorm=False, amp_enabled=False, ): super().__init__( vocab_size, hidden_size, hidden_layers, num_attention_heads, intermediate_size, hidden_dropout_prob, attention_probs_dropout_prob, max_position_embeddings, num_tokentypes=num_tokentypes, add_pooling_layer=add_pooling_layer, initializer_range=initializer_range, layernorm_eps=layernorm_eps, bias_gelu_fusion=bias_gelu_fusion, bias_dropout_fusion=bias_dropout_fusion, scale_mask_softmax_fusion=scale_mask_softmax_fusion, apply_query_key_layer_scaling=apply_query_key_layer_scaling, apply_residual_post_layernorm=apply_residual_post_layernorm, amp_enabled=amp_enabled, ) init_method = init_method_normal(initializer_range) # Embeddings self.embeddings = RobertaEmbeddings( vocab_size, hidden_size, max_position_embeddings, hidden_dropout_prob, num_tokentypes, pad_token_id, init_method, amp_enabled, ) # Mask generation self.extended_attn_mask = RobertaExtendedAttnMask() self.pooler = RobertaPooler(hidden_size, init_method) if add_pooling_layer else None @classmethod def from_config(cls, cfg): return { "vocab_size": cfg.vocab_size, "hidden_size": cfg.hidden_size, "hidden_layers": cfg.hidden_layers, "num_attention_heads": cfg.num_attention_heads, "intermediate_size": cfg.intermediate_size, "hidden_dropout_prob": cfg.hidden_dropout_prob, "attention_probs_dropout_prob": cfg.attention_probs_dropout_prob, "max_position_embeddings": cfg.max_position_embeddings, "num_tokentypes": cfg.num_tokentypes, "add_pooling_layer": cfg.add_pooling_layer, "initializer_range": cfg.initializer_range, "layernorm_eps": cfg.layernorm_eps, "pad_token_id": cfg.pad_token_id, "bias_gelu_fusion": cfg.bias_gelu_fusion, "bias_dropout_fusion": cfg.bias_dropout_fusion, "scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion, "apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling, "apply_residual_post_layernorm": cfg.apply_residual_post_layernorm, "amp_enabled": cfg.amp_enabled, } class RobertaLMHead(nn.Module): def __init__(self, vocab_size, hidden_size, init_method, layer_norm_eps): super().__init__() self.dense = Linear( hidden_size, hidden_size, bias=True, parallel="data", init_method=init_method, layer_idx=-1, ) self.activation_func = build_activation("gelu") self.layernorm = LayerNorm((hidden_size,), eps=layer_norm_eps, layer_idx=-1) # NOTE(xzp): LMLogits as a decoder:nn.Linear(hidden_size, vocab_size), # it shares the roberta.word_embeddings.weight self.lm_logits = LMLogits(vocab_size, bias=True) def forward(self, hidden_states, word_embeddings_weight): hidden_states = self.dense(hidden_states) hidden_states = self.activation_func(hidden_states) hidden_states = hidden_states.to_global( sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]) ) hidden_states = self.layernorm(hidden_states) hidden_states = self.lm_logits(hidden_states, word_embeddings_weight) return hidden_states class RobertaPreTrainedModel(nn.Module): @staticmethod def set_pipeline_stage_id(model): dist_utils = dist.get_dist_util() # Set pipeline parallelism stage_id if hasattr(model.roberta.final_layernorm, "config"): # Old API in OneFlow 0.8 for module_block in 


class RobertaPreTrainedModel(nn.Module):
    @staticmethod
    def set_pipeline_stage_id(model):
        dist_utils = dist.get_dist_util()

        # Set pipeline parallelism stage_id
        if hasattr(model.roberta.final_layernorm, "config"):
            # Old API in OneFlow 0.8
            for module_block in model.modules():
                # module_block.origin can get the original module
                if isinstance(module_block.origin, RobertaEmbeddings):
                    module_block.config.set_stage(
                        dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
                    )
                elif isinstance(module_block.origin, RobertaExtendedAttnMask):
                    module_block.config.set_stage(
                        dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
                    )
                elif isinstance(module_block.origin, TransformerLayer):
                    module_block.config.set_stage(
                        dist_utils.get_layer_stage_id(module_block.layer_idx),
                        dist.get_layer_placement(module_block.layer_idx),
                    )
                # `add_pooling_layer` is set to False in RobertaForPreTraining and
                # RobertaForCausalLM, so this branch is usually skipped.
                elif isinstance(module_block.origin, RobertaPooler):
                    module_block.config.set_stage(
                        dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
                    )
                elif isinstance(module_block.origin, RobertaLMHead):
                    module_block.config.set_stage(
                        dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
                    )

            # Set the last layernorm stage id
            model.roberta.final_layernorm.config.set_stage(
                dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
            )
        else:
            for module_block in model.modules():
                # module_block.to(nn.Module) can get the original module
                if isinstance(module_block.to(nn.Module), RobertaEmbeddings):
                    module_block.to(nn.graph.GraphModule).set_stage(
                        dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
                    )
                elif isinstance(module_block.to(nn.Module), RobertaExtendedAttnMask):
                    module_block.to(nn.graph.GraphModule).set_stage(
                        dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
                    )
                elif isinstance(module_block.to(nn.Module), TransformerLayer):
                    module_block.to(nn.graph.GraphModule).set_stage(
                        dist_utils.get_layer_stage_id(module_block.layer_idx),
                        dist.get_layer_placement(module_block.layer_idx),
                    )
                # `add_pooling_layer` is set to False in RobertaForPreTraining and
                # RobertaForCausalLM, so this branch is usually skipped.
                elif isinstance(module_block.to(nn.Module), RobertaPooler):
                    module_block.to(nn.graph.GraphModule).set_stage(
                        dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
                    )
                elif isinstance(module_block.to(nn.Module), RobertaLMHead):
                    module_block.to(nn.graph.GraphModule).set_stage(
                        dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
                    )

            # Set the last layernorm stage id
            model.roberta.final_layernorm.to(nn.graph.GraphModule).set_stage(
                dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
            )
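

# Schematic summary (comments only) of the stage mapping applied above when training
# with pipeline parallelism; set_pipeline_stage_id is typically invoked by LibAI's
# graph-mode trainer (an assumption about the call site, not defined in this file):
#
#   RobertaEmbeddings / RobertaExtendedAttnMask  -> first stage (layer 0 placement)
#   each TransformerLayer                        -> stage owning its layer_idx
#   RobertaPooler / RobertaLMHead / final LN     -> last stage (layer -1 placement)
#
# The first branch targets the old `module_block.config` API from OneFlow 0.8; the
# second uses the newer `module_block.to(nn.graph.GraphModule).set_stage` API.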


class RobertaForPreTraining(RobertaPreTrainedModel):
    def __init__(self, cfg):
        super().__init__()
        cfg.add_pooling_layer = False
        self.roberta = RobertaModel(cfg)
        self.lm_head = RobertaLMHead(
            cfg.vocab_size,
            cfg.hidden_size,
            init_method_normal(cfg.initializer_range),
            cfg.layernorm_eps,
        )
        self.loss_fc = RobertaLoss()

    def forward(
        self,
        input_ids,
        attention_mask,
        tokentype_ids=None,
        lm_labels=None,
        loss_mask=None,
    ):
        """
        Args:
            input_ids (flow.LongTensor): Indices of input sequence tokens in the vocabulary.
            attention_mask (flow.BoolTensor):
                Mask to avoid performing attention on padding token indices.
                Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            tokentype_ids (flow.LongTensor, optional):
                Segment token indices to indicate first and second portions of the inputs.
                Indices are selected in `[0, 1]`. Defaults to None.
            lm_labels (flow.LongTensor, optional):
                Labels for computing the masked language modeling loss.
                Indices should be in `[-1, 0, ..., config.vocab_size]`. Defaults to None.
            loss_mask (flow.BoolTensor, optional):
                Mask to avoid computing the loss on ignored tokens.
                Tokens with indices set to `-1` are ignored (masked); the loss is only
                computed for the tokens with labels in `[0, ..., config.vocab_size]`.
                Defaults to None.
        """
        input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
        attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
        tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))

        outputs = self.roberta(input_ids, attention_mask, tokentype_ids=tokentype_ids)
        sequence_output = outputs[0]

        prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())

        if lm_labels is not None:
            return self.loss_fc(prediction_scores, lm_labels, loss_mask)
        return {"prediction_scores": prediction_scores}


class RobertaForCausalLM(RobertaPreTrainedModel):
    def __init__(self, cfg):
        super().__init__()
        cfg.add_pooling_layer = False
        self.roberta = RobertaModel(cfg)
        self.lm_head = RobertaLMHead(
            cfg.vocab_size,
            cfg.hidden_size,
            init_method_normal(cfg.initializer_range),
            cfg.layernorm_eps,
        )
        self.loss_fc = RobertaLoss()

    def forward(
        self,
        input_ids,
        attention_mask,
        tokentype_ids=None,
        position_ids=None,
        labels=None,
        loss_mask=None,
    ):
        """
        Args:
            input_ids (flow.LongTensor): Indices of input sequence tokens in the vocabulary.
            attention_mask (flow.BoolTensor):
                Mask to avoid performing attention on padding token indices.
                Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            tokentype_ids (flow.LongTensor, optional):
                Segment token indices to indicate first and second portions of the inputs.
                Indices are selected in `[0, 1]`. Defaults to None.
            position_ids (flow.LongTensor, optional):
                Indices of positions of each input sequence token in the position embeddings.
                Defaults to None.
            labels (flow.LongTensor, optional):
                Labels for computing the language modeling loss.
                Indices should be in `[-1, 0, ..., config.vocab_size]`. Defaults to None.
            loss_mask (flow.BoolTensor, optional):
                Mask to avoid computing the loss on ignored tokens.
                Tokens with indices set to `-1` are ignored (masked); the loss is only
                computed for the tokens with labels in `[0, ..., config.vocab_size]`.
                Defaults to None.
        """
        outputs = self.roberta(input_ids, attention_mask, position_ids, tokentype_ids)
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())

        if labels is not None:
            # Next-token prediction task: shift prediction_scores and labels by one.
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            shifted_prediction_scores = shifted_prediction_scores.to_global(
                sbp=prediction_scores.sbp
            )
            shifted_labels = labels[:, 1:].contiguous()
            shifted_labels = shifted_labels.to_global(sbp=shifted_labels.sbp)
            # RobertaLoss already returns a dict keyed by "lm_loss".
            return self.loss_fc(shifted_prediction_scores, shifted_labels, loss_mask)
        return {"prediction_scores": prediction_scores}
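

# Forward-pass sketch for RobertaForPreTraining (comments only; tensor shapes are
# illustrative assumptions):
#
#   model = RobertaForPreTraining(cfg)       # cfg as in the RobertaModel sketch above
#   # input_ids / attention_mask / tokentype_ids: global tensors of shape
#   # (batch_size, seq_len); lm_labels / loss_mask mark the masked positions.
#   out = model(input_ids, attention_mask, tokentype_ids)
#   out["prediction_scores"]                 # roughly (batch_size, seq_len, vocab_size)
#   losses = model(input_ids, attention_mask, tokentype_ids,
#                  lm_labels=lm_labels, loss_mask=loss_mask)
#   losses["lm_loss"]                        # scalar masked language modeling loss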