Commit 3b355d3f authored by yuguo960516

gpt2

parent fd158e88
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LayerNorm(nn.Module):
"""Applies Layer Normalization over a mini-batch of inputs in 1D parallelism.
Args:
normalized_shape: shape of the trailing dimensions of the input over which normalization is applied.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self, normalized_shape, eps=1e-5, elementwise_affine=True, bias=True, *, layer_idx=0
):
super().__init__()
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.layer_idx = layer_idx
if elementwise_affine:
self.weight = nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.bias = nn.Parameter(
flow.zeros(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
requires_grad=bias,
)
else:
self.weight = None
self.bias = None
def forward(self, x):
assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
begin_norm_axis = x.ndim - len(self.normalized_shape)
begin_params_axis = x.ndim - len(self.normalized_shape)
if self.elementwise_affine:
y = flow._C.layer_norm_affine(
x,
self.weight,
self.bias,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
else:
y = flow._C.layer_norm(
x,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
return y
def extra_repr(self) -> str:
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
**self.__dict__
)
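# --- Usage sketch (not part of the original module) ---------------------------------
# A minimal, hypothetical example assuming it runs inside a launched LiBai
# distributed environment, so dist.get_layer_placement()/get_nd_sbp() are valid.
# It only illustrates how normalized_shape maps to the norm axes in forward():
# for an input of shape (batch, seq_len, hidden) and normalized_shape=(hidden,),
# begin_norm_axis = 3 - 1 = 2, i.e. normalization runs over the last dimension.
def _layer_norm_usage_sketch(hidden_size=768):
    ln = LayerNorm(hidden_size, eps=1e-5, layer_idx=0)
    x = flow.randn(
        2,
        16,
        hidden_size,
        placement=dist.get_layer_placement(0),
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
    )
    return ln(x)  # output has the same shape and sbp as x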
class RMSLayerNorm(nn.Module):
"""T5 uses a layer norm which only scales and does not shift, also known as
Root Mean Square Layer Normalization; the variance is therefore computed without
subtracting the mean and there is no bias. For more details, see https://arxiv.org/abs/1910.07467.
Args:
normalized_shape: shape of the trailing dimensions of the input over which the statistics are computed.
eps: a value added to the denominator for numerical stability. Defaults to 1e-6.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(self, normalized_shape, eps=1e-6, layer_idx=0):
super().__init__()
self.layer_idx = layer_idx
self.weight = flow.nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.l2norm_epsilon = eps
def forward(self, hidden_states):
return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon)
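# --- Reference sketch (not part of the original module) -----------------------------
# The fused flow._C.rms_layer_norm call above is, numerically, the plain RMSNorm
# formula y = x / sqrt(mean(x ** 2, dim=-1) + eps) * weight. The helper below is a
# local-tensor illustration of that formula only; the real module operates on
# global (placement + sbp) tensors.
def _rms_layer_norm_reference(x, weight, eps=1e-6):
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * flow.rsqrt(variance + eps) * weight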
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Linear1D(nn.Module):
r"""Linear layer with 1D parallelism which includes column parallelism and row parallelism.
The linear layer is defined as :math:`y = xA^T + b`.
In column parallelism, A^T is parallelized along the second dimension
as :math:`A^T = [A_1, ..., A_p]`.
In row parallelism, A^T is parallelized along the first dimension and X along its second
dimension as:
.. math::
A^T = \begin{bmatrix} A_1 \\ \vdots \\ A_p \end{bmatrix},
\quad
x = \begin{bmatrix} x_1 & \cdots & x_p \end{bmatrix}
Arguments:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
parallel: parallel mode, one of "data", "col" or "row". Defaults to "data".
init_method: method to initialize weight. Defaults to :func:`nn.init.xavier_normal_`.
skip_bias_add: skip adding bias but instead return it, so that adding bias can be fused with
other elementwise operations. Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
*,
layer_idx=0, # enforce layer_idx passed with keyword
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
# Column parallel
# weight sbp sign: [B, S(0)]; the weight is transposed when performing matmul,
# so its effective sbp sign is [B, S(1)]
# bias sbp sign: [B, S(0)]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "row":
# Row parallel
# weight sbp sign: [B, S(1)]; the weight is transposed when performing matmul,
# so its effective sbp sign is [B, S(0)]
# bias sbp sign: [B, B]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
raise KeyError(f"{parallel} is not supported! Only 'data', 'col' and 'row' are supported.")
self.weight = flow.nn.Parameter(
flow.empty(
(out_features, in_features),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])):
# If the last dim of weight sbp sign is S(0), then last dim of weight.t sbp
# sign is S(1), so the last dim of x sbp sign must be B.
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight, transpose_b=True)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
):
# If the last dim of weight sbp sign is S(1), then last dim of weight.t sbp
# sign is S(0), so the last dim of x sbp sign must be S(ndim-1).
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight, transpose_b=True)
# Change x.sbp for followup forward pass.
# This line can be removed when sbp can be auto inferred.
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
# NOTE(chengcheng): when input x is [S(0), B], there is no need to change sbp for x.
# x = x.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(0)]))
x = flow.matmul(x, self.weight, transpose_b=True)
else:
# Unsupported weight sbp: let OneFlow deduce the sbp and communicate with nccl automatically.
x = flow.matmul(x, self.weight, transpose_b=True)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
# Give an alias for Linear1d
Linear = Linear1D
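# --- Usage sketch (not part of the original module) ---------------------------------
# Hypothetical example assuming a launched 2D-parallel environment. "col" splits
# out_features across tensor-parallel ranks, "row" splits in_features, and "data"
# keeps the weight replicated; a column-parallel layer followed by a row-parallel
# layer is the usual pairing, as in the MLP module later in this commit.
def _linear1d_usage_sketch(hidden_size=768, ffn_hidden_size=3072):
    col = Linear1D(hidden_size, ffn_hidden_size, parallel="col", layer_idx=0)
    row = Linear1D(ffn_hidden_size, hidden_size, parallel="row", layer_idx=0)
    x = flow.randn(
        2,
        16,
        hidden_size,
        placement=dist.get_layer_placement(0),
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
    )
    return row(col(x))  # (2, 16, hidden_size), sbp (S(0), B)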
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LMLogits(nn.Module):
def __init__(self, vocab_size, bias=False):
super().__init__()
self.bias = (
nn.Parameter(
flow.zeros(
(vocab_size,),
dtype=flow.float32,
placement=dist.get_layer_placement(-1),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
if bias
else None
)
def forward(self, input, word_embeddings):
"""LM logits using word embedding weights"""
# input with sbp sign [S(0), B] and word_embeddings with sbp sign [S(0), B]
# NOTE(l1aoxingyu): This is for pipeline parallelism
# change word embedding placement from stage(0) to stage(-1)
w = word_embeddings.to_global(placement=input.placement)
# NOTE(l1aoxingyu): input x embed^T = logits with sbp sign
# [S(0), B] x [B, S(1)] --> [S(0), S(1)]
# ↑ ↑ ↑
# input embed^T logits
# Backward pass input.grad = logits.grad x embed with sbp sign
# [S(0), S(1)] x [B, S(0)] --> [S(0), P]
# ↑ ↑ ↑
# logits.grad embed input.grad
# When using input.grad as the head node of the backward pass, we need to convert
# its sbp sign from [S(0), P] --> [S(0), B]
input = input.to_global(grad_sbp=input.sbp)
logits = flow._C.matmul(input, w, transpose_b=True)
if self.bias is not None:
logits = logits + self.bias
return logits
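# --- Shape sketch (not part of the original module) ---------------------------------
# The head ties the output projection to the input embedding: logits are just the
# matmul of the hidden states with the transposed word-embedding matrix. Ignoring
# sbp/placement, (batch, seq_len, hidden) x (vocab_size, hidden)^T gives
# (batch, seq_len, vocab_size).
def _lm_logits_shape_sketch(hidden_states, embedding_weight):
    return flow.matmul(hidden_states, embedding_weight, transpose_b=True)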
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.layers import Linear, build_activation
class MLP(nn.Module):
"""MLP
The MLP takes an input with hidden size h, projects it to the intermediate
hidden dimension (ffn_hidden_size), applies a GeLU activation, and projects the
state back to hidden size h.
Arguments:
hidden_size: size of each input and output sample.
ffn_hidden_size: size of each intermediate sample.
output_dropout_prob: Output dropout probability. Defaults to 0.0.
init_method: method to initialize the first linear weight.
Defaults to :func:`nn.init.xavier_normal_`.
output_layer_init_method: method to initialize the second linear weight. If set to None,
it will use ``init_method`` instead. Defaults to None.
bias_gelu_fusion: If set to ``True``, it will fuse bias adding and elementwise
gelu activation. Defaults to ``False``.
bias_dropout_fusion: If set to ``True``, it will fuse bias adding and dropout.
Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in
pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
*,
layer_idx=0,
):
super().__init__()
self.output_dropout_prob = output_dropout_prob
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if output_layer_init_method is None:
output_layer_init_method = init_method
self.dense_h_to_4h = Linear(
hidden_size,
ffn_hidden_size,
bias=True,
parallel="col",
skip_bias_add=bias_gelu_fusion,
init_method=init_method,
layer_idx=layer_idx,
)
if not bias_gelu_fusion:
self.activation_func = build_activation("gelu")
self.dense_4h_to_h = Linear(
ffn_hidden_size,
hidden_size,
bias=True,
parallel="row",
skip_bias_add=bias_dropout_fusion,
init_method=output_layer_init_method,
layer_idx=layer_idx,
)
if not bias_dropout_fusion:
self.dropout = nn.Dropout(self.output_dropout_prob)
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate, bias = intermediate
intermediate = flow._C.fused_bias_add_gelu(
intermediate, bias, axis=intermediate.ndim - 1
)
else:
intermediate = self.activation_func(intermediate)
output = self.dense_4h_to_h(intermediate)
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.dropout(output)
return output
def extra_repr(self) -> str:
return "bias_gelu_fusion={}, bias_dropout_fusion={}, dropout={}".format(
self.bias_gelu_fusion, self.bias_dropout_fusion, self.output_dropout_prob
)
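# --- Reference sketch (not part of the original module) -----------------------------
# Simplified, local-tensor view of the h -> ffn -> h pattern implemented above with
# the fusion flags disabled: expand the hidden size, apply GeLU, project back, then
# apply dropout. Weights are assumed to be stored as (out_features, in_features),
# matching Linear1D.
def _mlp_reference_sketch(x, w1, b1, w2, b2, p=0.0):
    h = flow.matmul(x, w1, transpose_b=True) + b1  # (..., hidden) -> (..., ffn)
    h = flow.nn.functional.gelu(h)
    out = flow.matmul(h, w2, transpose_b=True) + b2  # (..., ffn) -> (..., hidden)
    return flow.nn.functional.dropout(out, p=p)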
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow.nn as nn
from libai.utils import distributed as dist
from .attention import AttnMaskType, MultiheadAttention
from .droppath import DropPath
from .layer_norm import LayerNorm
from .mlp import MLP
class TransformerLayer(nn.Module):
"""A single transformer layer.
A transformer layer takes an input of size [bsz, seq_length, hidden_size] and returns an
output of the same size.
The input and output have the same sbp sign, (S(0), B).
Arguments:
hidden_size: size of hidden state.
ffn_hidden_size: size of the feed-forward network hidden layer.
num_attention_heads: number of attention heads.
is_decoder: whether this is a transformer decoder layer (with cross attention)
or a transformer encoder layer. Default: ``False``.
attention_dropout_prob: dropout probability of attention weights.
output_dropout_prob: dropout probability of output.
layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
init_method: method to initialize the input layer weights.
output_layer_init_method: method to initialize the output layer weights.
If None, use `init_method`.
bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
apply_query_key_layer_scaling: if ``True``, scale the attention score by the layer index.
Default: ``False``.
apply_residual_post_layernorm: if ``True``, use the original BERT residual
connection ordering; otherwise use the Megatron-BERT residual connection, which
is more stable when scaling up model size (see https://arxiv.org/pdf/1909.08053.pdf).
Default: ``False``.
layer_idx: the layer index, which determines the placement.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
num_attention_heads,
is_decoder=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
drop_path_prob=0.0,
layernorm_epsilon=1e-5,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.num_attention_heads = num_attention_heads
self.attention_dropout_prob = attention_dropout_prob
self.output_dropout_prob = output_dropout_prob
self.layernorm_epsilon = layernorm_epsilon
self.attn_mask_type = attn_mask_type
self.layer_idx = layer_idx
self.is_decoder = is_decoder
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.apply_residual_post_layernorm = apply_residual_post_layernorm
self.init_method = init_method
if output_layer_init_method is None:
output_layer_init_method = init_method
self.output_layer_init_method = output_layer_init_method
self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()
self.input_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.self_attention = self.build_attention(is_cross_attention=False)
self.post_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
if self.is_decoder:
self.cross_attention = self.build_attention(is_cross_attention=True)
self.post_cross_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.mlp = MLP(
self.hidden_size,
self.ffn_hidden_size,
self.output_dropout_prob,
self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_gelu_fusion=self.bias_gelu_fusion,
bias_dropout_fusion=self.bias_dropout_fusion,
layer_idx=self.layer_idx,
)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_states=None,
encoder_attention_mask=None,
past_key_value=None,
use_cache=False,
):
"""
Args:
hidden_states: shape is (batch_size, seq_length, hidden_size),
sbp signature is (S(0), B).
attention_mask: the combination of the key padding mask and the causal mask of the
hidden states, with shape (batch_size, 1, seq_length, seq_length) and sbp
signature (S(0), B).
encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
and the sbp signature is (S(0), B), which will be used in cross attention.
encoder_attention_mask: key padding mask of encoder states with shape
(batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
past_key_value: tuple of key and value tensors, each with shape
(seq_length, bsz, num_heads, head_size). For a decoder layer,
past_key_value contains the states from both self attention
and cross attention.
use_cache: set to ``True`` when the model is in the inference phase, to enable
incremental decoding.
"""
# Change placement for pipeline parallelism
hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
# hidden_states shape: (batch_size, seq_length, hidden_size)
if attention_mask is not None:
attention_mask = attention_mask.to_global(
placement=dist.get_layer_placement(self.layer_idx)
)
if past_key_value is not None:
if self.is_decoder:
assert len(past_key_value) == 4
self_attn_past_key_value = past_key_value[:2]
cross_attn_past_key_value = past_key_value[2:]
else:
self_attn_past_key_value = past_key_value
cross_attn_past_key_value = None
else:
self_attn_past_key_value, cross_attn_past_key_value = None, None
layernorm_output = self.input_layernorm(hidden_states)
attention_output = self.self_attention(
layernorm_output,
attention_mask=attention_mask,
past_key_value=self_attn_past_key_value,
use_cache=use_cache,
)
attention_output = self.drop_path(attention_output)
if use_cache:
attention_output, presents = attention_output
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_attention_layernorm(hidden_states)
if self.is_decoder:
attention_output = self.cross_attention(
layernorm_output,
encoder_states,
attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
use_cache=use_cache,
)
if use_cache:
attention_output, decoder_presents = attention_output
presents += decoder_presents
attention_output = self.drop_path(attention_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_cross_attention_layernorm(hidden_states)
mlp_output = self.mlp(layernorm_output)
mlp_output = self.drop_path(mlp_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
output = residual + mlp_output
if use_cache:
output = (output, presents)
return output
def build_attention(self, is_cross_attention=False):
return MultiheadAttention(
self.hidden_size,
self.num_attention_heads,
is_cross_attention=is_cross_attention,
attention_dropout_prob=self.attention_dropout_prob,
output_dropout_prob=self.output_dropout_prob,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_dropout_fusion=self.bias_dropout_fusion,
scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
attn_mask_type=self.attn_mask_type,
layer_idx=self.layer_idx,
)
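# --- Reading aid (not part of the original module) ----------------------------------
# Minimal sketch of the pre-LayerNorm residual ordering used in forward() above for
# the encoder path, with use_cache=False and apply_residual_post_layernorm=False.
# ln1/attn/ln2/mlp stand for input_layernorm, self_attention,
# post_attention_layernorm and mlp; sbp/placement handling is omitted.
def _pre_ln_block_sketch(x, ln1, attn, ln2, mlp, mask=None):
    h = x + attn(ln1(x), attention_mask=mask)  # attention sub-block + residual
    return h + mlp(ln2(h))  # MLP sub-block + residual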
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bert_model import BertForPreTraining, BertModel, BertForClassification
from .roberta_model import RobertaForPreTraining, RobertaForCausalLM, RobertaModel
from .build import build_graph, build_model
from .t5_model import T5ForPreTraining, T5Model
from .gpt_model import GPTForPreTraining, GPTModel
from .vision_transformer import VisionTransformer
from .swin_transformer import SwinTransformer
from .swin_transformer_v2 import SwinTransformerV2
from .resmlp import ResMLP
__all__ = [
"build_model",
"build_graph",
"BertModel",
"BertForPreTraining",
"BertForClassification",
"RobertaModel",
"RobertaForCausalLM",
"RobertaForPreTraining",
"T5Model",
"T5ForPreTraining",
"GPTModel",
"GPTForPreTraining",
"VisionTransformer",
"SwinTransformer",
"SwinTransformerV2",
"ResMLP",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class BertExtendedAttnMask(nn.Module):
def forward(self, attention_mask):
# We create a 3D attention mask from a 2D tensor mask.
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
return extended_attention_mask
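# --- Shape example (not part of the original module) --------------------------------
# A 2D padding mask (batch, seq_len) is expanded into a 4D mask
# (batch, 1, seq_len, seq_len) where entry (i, j) is 1 only if both token i and
# token j are real (non-padding) tokens, matching the attention-score shape.
def _extended_attn_mask_example():
    mask_2d = flow.tensor([[1, 1, 1, 0]], dtype=flow.int8)  # one padded position
    mask_4d = BertExtendedAttnMask()(mask_2d)
    return mask_4d.shape  # flow.Size([1, 1, 4, 4])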
class BertEmbeddings(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.vocab_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_sequence_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
# NOTE(l1aoxingyu): Set the position_ids sbp sign to [B, B] initially, because position_ids is a
# 1D tensor from 0 to seq_length; if it were set to [S(0), B] at first, position_ids
# would be split along the first dim of the hierarchy.
self.position_ids = flow.arange(
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
self.position_ids.size(),
dtype=flow.long,
sbp=self.position_ids.sbp,
placement=self.position_ids.placement,
)
else:
self.tokentype_embeddings = None
self.embedding_dropout = nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
# Change position_ids sbp sign: [B, B] -> [S(0), B]
position_ids = (
self.position_ids[:, :seq_length].expand_as(input_ids).to_global(sbp=input_ids.sbp)
)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def word_embeddings(self):
return self.vocab_embeddings.weight
class BertLMPredictionHead(nn.Module):
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), layer_idx=-1)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
grad_sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(2)])
)
# NOTE(l1aoxingyu): hidden_states has shape [B, S, H] with sbp sign [S(0), S(2)]
# Change from [S(0), S(2)] -> [S(0), B] because layernorm cannot accept inputs with sbp S(2)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
return hidden_states
class BertPooler(nn.Module):
"""Pooler layer.
Pool hidden states of the first token and
add a linear transformation followed by a tanh.
Args:
hidden_size: hidden state feature dimension
"""
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="col",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("tanh")
def forward(self, hidden_states):
"""Just "pool" the model by taking the hidden state of the [CLS] token,
i.e. the first token of the sequence."""
# hidden_states: [bsz, seq_len, hidden_size]
select_token_tensor = hidden_states[:, 0, :]
pooled_output = self.dense(select_token_tensor)
pooled_output = self.activation_func(pooled_output)
return pooled_output
class BertLoss(nn.Module):
def __init__(self, add_binary_head):
super().__init__()
self.add_binary_head = add_binary_head
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask, binary_logits, ns_labels):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
binary_logits = binary_logits.to_global(placement=lm_output.placement)
ns_labels = ns_labels.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
# Change loss_mask.sum() sbp sign from [P, B] -> [B, B]
# because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P
denominator = (
loss_mask.sum().to_global(sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
+ 1e-7
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
# NOTE(l1aoxingyu): Change lm loss sbp sign [P, P] -> [P, B] to add with sop loss
# whose sbp sign: [P, B]
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
if self.add_binary_head:
sop_loss = flow._C.cross_entropy(
binary_logits, ns_labels, ignore_index=-1, reduction="none"
).mean()
loss_dict["sop_loss"] = sop_loss
return loss_dict
class BertModel(nn.Module):
"""The bare Bert Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
hidden_layers (int): The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
hidden_dropout_prob (float, optional):
The dropout ratio for the output of each TransformerLayer. Defaults to 0.0.
attention_probs_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``BertEmbedding``.
num_tokentypes (int, optional):
Number of segment token indices. Defaults to 2.
add_pooling_layer (bool, optional):
Whether or not to add a pooling layer that pools the hidden state of the first
token of the whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
apply_residual_post_layernorm (bool, optional):
If set to ``True``, use the original BERT residual connection ordering; otherwise use the
Megatron-BERT residual connection, which is more stable when scaling up model size
(see https://arxiv.org/pdf/1909.08053.pdf).
Default: ``False``.
amp_enabled (bool, optional):
Whether or not to use fp16 for the embedding weight in the BERT model. Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(initializer_range)
scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
# Embeddings
self.embeddings = BertEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = BertExtendedAttnMask()
# Encoders
self.encoders = nn.ModuleList(
[
TransformerLayer(
hidden_size,
intermediate_size,
num_attention_heads,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
init_method=init_method,
output_layer_init_method=scaled_init_method,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.padding, # bert mask type
layer_idx=i,
)
for i in range(hidden_layers)
]
)
self.final_layernorm = LayerNorm((hidden_size,), eps=layernorm_eps, layer_idx=-1)
self.pooler = BertPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids, attention_mask, tokentype_ids=None):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention
on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first and
second portions of the inputs. Indices are selected in `[0, 1]`. Defaults to None.
"""
extended_attention_mask = self.extended_attn_mask(attention_mask)
embedding_output = self.embeddings(input_ids, tokentype_ids)
hidden_states = embedding_output
for layer in self.encoders:
hidden_states = layer(hidden_states, extended_attention_mask)
encoder_output = self.final_layernorm(hidden_states)
pooled_output = self.pooler(encoder_output) if self.pooler is not None else None
return encoder_output, pooled_output
def word_embeddings_weight(self):
return self.embeddings.word_embeddings()
class BertPreTrainingHeads(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, add_binary_head=True):
super().__init__()
self.predictions = BertLMPredictionHead(hidden_size, init_method)
self.seq_relationship = Linear(
hidden_size,
2,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.lm_logits = LMLogits(vocab_size, bias=True)
self.loss_func = BertLoss(add_binary_head)
def forward(
self,
sequence_output,
pooled_output,
word_embeddings_weight,
ns_labels,
lm_labels,
loss_mask,
):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
prediction_scores = self.lm_logits(prediction_scores, word_embeddings_weight)
if lm_labels is not None:
return self.loss_func(
prediction_scores, lm_labels, loss_mask, seq_relationship_score, ns_labels
)
return {
"prediction_scores": prediction_scores,
"seq_relationship_score": seq_relationship_score,
}
class BertForPreTraining(nn.Module):
"""Bert Model with two heads on top as done during the pretraining: a
`masked language modeling` head and a `next sentence prediction (classification)` head.
"""
def __init__(self, cfg):
super().__init__()
self.bert = BertModel(cfg)
self.cls_head = BertPreTrainingHeads(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.add_binary_head,
)
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
ns_labels=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
ns_labels (flow.LongTensor, optional): Labels for computing the next sequence prediction
(classification) loss. Input should be a sequence pair (see `input_ids` docstring).
Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
loss_mask (flow.BoolTensor, optional): Mask to avoid computing the loss
on ignored tokens. Tokens with indices set to `-1` are ignored (masked); the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
sequence_output, pooled_output = outputs[:2]
return self.cls_head(
sequence_output,
pooled_output,
self.bert.word_embeddings_weight(),
ns_labels,
lm_labels,
loss_mask,
)
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.bert.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, BertEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, BertExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, BertPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, BertPreTrainingHeads):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), BertEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), BertExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), BertPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), BertPreTrainingHeads):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class BertForClassification(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.num_labels = cfg.num_labels
self.bert = BertModel(cfg)
self.classifier = Linear(
cfg.hidden_size,
cfg.num_labels,
bias=True,
parallel="row",
init_method=init_method_normal(cfg.initializer_range),
layer_idx=-1,
)
classifier_dropout = (
cfg.classifier_dropout
if cfg.classifier_dropout is not None
else cfg.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
def forward(self, input_ids, attention_mask, tokentype_ids=None, labels=None, **kwargs):
labels = labels if labels is not None else kwargs.get("ns_labels")
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
loss = loss.to_global(sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast]))
return {"cls_loss": loss}
else:
return {"logits": logits}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate, try_get_key
def build_model(cfg):
"""Build the whole model architecture, defined by ``cfg.model``.
Note that it does not load any weights from ``cfg``.
"""
model = instantiate(cfg)
return model
def build_graph(cfg, model, optimizer=None, lr_scheduler=None, is_train=False):
"""Build the `nn.Graph`, defined by ``cfg.graph``."""
auto_parallel_conf = try_get_key(cfg, "graph.auto_parallel", default=None)
if is_train:
# Set train graph
assert optimizer is not None, "optimizer must be set for train graph"
assert lr_scheduler is not None, "lr_scheduler must be set for train graph"
graph = cfg.graph.train_graph
graph.model = model
graph.optimizer = optimizer
graph.lr_scheduler = lr_scheduler
graph.fp16 = try_get_key(cfg, "train.amp.enabled", default=False)
graph.activation_checkpoint = try_get_key(
cfg, "train.activation_checkpoint.enabled", default=False
)
graph.zero_optim = try_get_key(cfg, "train.zero_optimization.enabled", default=False)
graph.zero_stage = try_get_key(cfg, "train.zero_optimization.stage", default=1)
graph.grad_acc_steps = try_get_key(cfg, "train.num_accumulation_steps", default=1)
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
else:
# Set eval graph
graph = cfg.graph.eval_graph
graph.model = model
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class CasualMask(nn.Module):
"""
Create a causal mask and combine it with the padding mask.
It is used in the GPT model and the T5 decoder.
For the T5 decoder, the argument `layer_idx` should be set to the index of the first decoder layer.
"""
def __init__(self, max_positions=1024, *, layer_idx=0):
super().__init__()
self.mask = flow.tril(
flow.ones(
(max_positions, max_positions),
dtype=flow.int8,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
def forward(self, input_ids, past_length=0, attention_mask=None):
bsz, tgt_len = input_ids.size()
casual_mask = self.mask[:tgt_len, :tgt_len]
if past_length > 0:
# in case past_key_values are used, we need to prepend a mask of ones to the causal mask
casual_mask = flow.cat(
[flow.ones(tgt_len, past_length, dtype=flow.int8), casual_mask], dim=-1
)
casual_mask = (
casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length)
)
casual_mask = casual_mask.to_global(sbp=input_ids.sbp)
if attention_mask is not None:
assert attention_mask.dim() == 4, "please extend the attention mask first"
casual_mask = casual_mask * attention_mask
return casual_mask
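# --- Illustration (not part of the original module) ---------------------------------
# Contents of the mask with local tensors and no padding mask: flow.tril keeps the
# lower triangle, so token i may only attend to tokens 0..i.
#   [[1, 0, 0, 0],
#    [1, 1, 0, 0],
#    [1, 1, 1, 0],
#    [1, 1, 1, 1]]
def _causal_mask_pattern(tgt_len=4):
    return flow.tril(flow.ones((tgt_len, tgt_len), dtype=flow.int8))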
class GPTModel(nn.Module):
"""GPT-2 language model. The output of the forward method is logits.
Args:
hidden_layers (int): The number of ``TransformerLayer`` in the gpt model.
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
ffn_hidden_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
max_seq_length (int, optional):
Max sequence length of input, defines the shape of Position Embeddings in GPTEmbedding.
Defaults to 1024.
embedding_dropout_prob (float, optional):
The dropout ratio for the output of GPTEmbedding Layer. Defaults to 0.0.
attention_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
output_dropout_prob (float, optional):
The dropout ratio for the output of each TransformerLayer. Defaults to 0.0.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
use_scaled_init_for_output_weights (bool, optional): Defaults to ``True``.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``False``.
apply_residual_post_layernorm (bool, optional):
If set to ``True``, use the original BERT residual connection ordering; otherwise use the
Megatron-BERT residual connection, which is more stable when scaling up model size
(see https://arxiv.org/pdf/1909.08053.pdf).
Default: ``False``.
amp_enabled (bool, optional):
Whether or not to use fp16 for the embedding weight in the GPT model. Defaults to ``False``.
"""
@configurable
def __init__(
self,
hidden_layers,
vocab_size,
hidden_size,
ffn_hidden_size,
num_attention_heads,
max_seq_length=1024,
embedding_dropout_prob=0.0,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
initializer_range=0.02,
use_scaled_init_for_output_weights=True,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(sigma=initializer_range)
if use_scaled_init_for_output_weights:
output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
else:
output_layer_init_method = init_method
self.embeddings = GPTEmbedding(
vocab_size,
hidden_size,
max_seq_length,
init_method=init_method,
embedding_dropout_prob=embedding_dropout_prob,
amp_enabled=amp_enabled,
)
self.transformer = Transformer(
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
)
self.lm_head = LMLogits(vocab_size, bias=False)
@classmethod
def from_config(cls, cfg):
return {
"hidden_layers": cfg.hidden_layers,
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"ffn_hidden_size": cfg.ffn_hidden_size,
"num_attention_heads": cfg.num_attention_heads,
"max_seq_length": cfg.max_seq_length,
"embedding_dropout_prob": cfg.embedding_dropout_prob,
"attention_dropout_prob": cfg.attention_dropout_prob,
"output_dropout_prob": cfg.output_dropout_prob,
"layernorm_epsilon": cfg.layernorm_epsilon,
"initializer_range": cfg.initializer_range,
"use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
Returns:
flow.Tensor: logits
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
input_embeds = self.embeddings(input_ids, 0)
transformer_output = self.transformer(input_embeds, attention_mask=None)
output = self.lm_head(transformer_output, self.embeddings.token_embeddings.weight)
return output
class GPTEmbedding(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_seq_length,
init_method=init.xavier_normal_,
embedding_dropout_prob=0.0,
amp_enabled=False,
):
super().__init__()
self.token_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_seq_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.dropout = nn.Dropout(embedding_dropout_prob)
self.position_ids = flow.arange(
max_seq_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
def forward(self, input_ids, past_length=0):
bsz, seq_length = input_ids.size()
position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
token_embeds = self.token_embeddings(input_ids)
position_embeds = self.position_embeddings(position_ids)
input_embeds = token_embeds + position_embeds
input_embeds = self.dropout(input_embeds)
return input_embeds
class Transformer(nn.Module):
def __init__(
self,
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
init_method=init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
):
super().__init__()
self.hidden_layers = hidden_layers
def build_layer(layer_number):
return TransformerLayer(
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.causal,
layer_idx=layer_number,
)
self.layers = nn.ModuleList([build_layer(i) for i in range(self.hidden_layers)])
self.layernorm_f = LayerNorm(hidden_size, eps=layernorm_epsilon, layer_idx=-1)
def forward(self, hidden_states, attention_mask):
# hidden_states shape: (batch_size, seq_length, hidden_size)
# sbp: [S(0), B]
for i, layer in enumerate(self.layers):
hidden_states = layer(hidden_states, attention_mask)
output = self.layernorm_f(hidden_states)
return output
class GPTLoss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, logits, lm_labels):
lm_loss = self.lm_loss(logits, lm_labels)
lm_loss = lm_loss.mean()
return {"lm_loss": lm_loss}
class GPTForPreTraining(nn.Module):
"""
GPT Model with a language modeling head on top.
"""
def __init__(self, cfg) -> None:
super().__init__()
self.GPT_model = GPTModel(cfg)
self.loss_func = GPTLoss()
def forward(
self,
input_ids,
labels=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
labels (flow.LongTensor, optional): Labels for computing language modeling loss.
None for evaluating. Defaults to None.
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation.
:code:`{"lm_loss": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
logits = self.GPT_model(input_ids)
if labels is not None:
lm_loss = self.loss_func(logits, labels)
return lm_loss
else:
return {"prediction_scores": logits}
@staticmethod
def set_pipeline_stage_id(model: nn.Module):
dist_utils = dist.get_dist_util()
if hasattr(model.GPT_model.transformer.layernorm_f, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, (GPTEmbedding, CasualMask)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, (LMLogits, GPTLoss)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), (GPTEmbedding, CasualMask)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), (LMLogits, GPTLoss)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
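# Editor's note: illustrative sketch, not part of the original commit. It only
# shows how a caller would consume the two return forms of
# GPTForPreTraining.forward documented above; `model`, `input_ids`, and
# `labels` are assumed to be prepared elsewhere (e.g. by a LiBai trainer with
# the distributed environment already set up).
def _demo_pretraining_step(model, input_ids, labels=None):
    outputs = model(input_ids, labels)
    if labels is not None:
        return outputs["lm_loss"]  # training: scalar language-modeling loss
    return outputs["prediction_scores"]  # evaluation: (bsz, seq_length, vocab) logits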
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ResMLP Model
# References:
# resmlp: https://github.com/facebookresearch/deit/blob/main/resmlp_models.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear, PatchEmbedding
class Affine(nn.Module):
def __init__(self, dim, *, layer_idx=0):
super().__init__()
self.alpha = nn.Parameter(
flow.ones(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.beta = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
return self.alpha * x + self.beta
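# Editor's note: illustrative sketch, not part of the original commit. Affine
# is the per-channel replacement for LayerNorm used by ResMLP: y = alpha * x + beta,
# with no normalization statistics. The helper below reproduces that
# computation on plain local tensors (shapes are demo assumptions).
def _demo_affine(batch=2, num_patches=4, dim=8):
    x = flow.randn(batch, num_patches, dim)
    alpha, beta = flow.ones(dim), flow.zeros(dim)
    return alpha * x + beta  # broadcasts over the (batch, num_patches) axes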
class layers_scale_mlp_blocks(nn.Module):
def __init__(
self, dim, drop=0.0, drop_path=0.0, init_values=1e-4, num_patches=196, *, layer_idx=0
):
super().__init__()
self.norm1 = Affine(dim, layer_idx=layer_idx)
self.attn = Linear(num_patches, num_patches, layer_idx=layer_idx)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = Affine(dim, layer_idx=layer_idx)
self.mlp = MLP(hidden_size=dim, ffn_hidden_size=int(4.0 * dim), layer_idx=layer_idx)
self.gamma_1 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.gamma_2 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
x = x + self.drop_path(
self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2)
)
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
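# Editor's note: illustrative sketch, not part of the original commit. The
# `attn` branch above mixes information across patches rather than channels:
# the (B, N, D) input is transposed to (B, D, N), a Linear over the patch axis
# is applied, and the result is transposed back before the layer-scale
# residual. The helper below traces those shapes with plain local modules.
def _demo_cross_patch_mixing(batch=2, num_patches=196, dim=8):
    x = flow.randn(batch, num_patches, dim)
    patch_mixer = nn.Linear(num_patches, num_patches)  # local analogue of `self.attn`
    y = patch_mixer(x.transpose(1, 2)).transpose(1, 2)
    return y.shape  # same (batch, num_patches, dim) shape as the input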
class ResMLP(nn.Module):
"""ResMLP in LiBai.
LiBai's implementation of:
`ResMLP: Feedforward networks for image classification with data-efficient training
<https://arxiv.org/abs/2105.03404>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): depth of transformer
drop_rate (float): dropout rate
drop_path_rate (float): stochastic depth rate
init_scale (float): the layer scale ratio
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
drop_rate=0.0,
drop_path_rate=0.0,
init_scale=1e-4,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
num_patches = self.patch_embed.num_patches
        dpr = [drop_path_rate for i in range(depth)]  # constant drop-path rate for every block (no decay)
self.blocks = nn.ModuleList(
[
layers_scale_mlp_blocks(
dim=embed_dim,
drop=drop_rate,
drop_path=dpr[i],
init_values=init_scale,
num_patches=num_patches,
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = Affine(embed_dim, layer_idx=-1)
self.head = (
Linear(embed_dim, num_classes, layer_idx=-1) if num_classes > 0 else nn.Identity()
)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
self.apply(self._init_weights)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"init_scale": cfg.init_scale,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward_features(self, x):
x = self.patch_embed(x)
# layer scale mlp blocks
for i, blk in enumerate(self.blocks):
x = blk(x)
return x
def forward_head(self, x):
B = x.shape[0]
x = self.norm(x)
x = x.mean(dim=1).reshape(B, 1, -1)
return self.head(x[:, 0])
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.loss_func, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).activation_checkpointing = True
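# Editor's note: illustrative sketch, not part of the original commit. With the
# default configuration above (img_size=224, patch_size=16, embed_dim=768) the
# patch grid is (224 // 16) ** 2 = 196 tokens, so every block mixes a
# (batch, 196, 768) tensor. The helper below mirrors the mean-pooling done in
# `forward_head` on plain local tensors, without instantiating the model
# (which needs LiBai's distributed environment).
def _demo_head_pooling(batch=2, num_patches=196, embed_dim=768, num_classes=1000):
    tokens = flow.randn(batch, num_patches, embed_dim)
    pooled = tokens.mean(dim=1).reshape(batch, 1, -1)[:, 0]  # (batch, embed_dim)
    head = nn.Linear(embed_dim, num_classes)  # local analogue of `self.head`
    return head(pooled).shape  # (batch, num_classes)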