Commit 5988d2cc authored by yuguo960516
bert-large
parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from typing import Optional
import oneflow as flow
from oneflow import nn
class Activation(str, Enum):
SquaredReLU = "squared_relu"
GeLU = "gelu"
GeLUTanh = "gelu_tanh"
LeakyReLU = "leaky_relu"
ReLU = "relu"
Tanh = "tanh"
QuickGELU = "quick_gelu"
# For unit testing / parity comparisons, probably not the fastest way
class SquaredReLU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
x_ = flow._C.relu(x)
return x_ * x_
class Passthrough(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x
class GeLUTanh(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
"""When the approximate argument is 'tanh', Gelu is estimated with:
0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0))))
"""
return flow.nn.functional.gelu(x, approximate="tanh")
class QuickGELU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x * flow.sigmoid(1.702 * x)
def build_activation(activation: Optional[Activation]):
"""
Fetch an activation layer by name, e.g.,
``build_activation("gelu")`` returns an ``nn.GELU()`` module.
"""
if not activation:
return Passthrough()
return {
Activation.ReLU: nn.ReLU,
Activation.GeLU: nn.GELU,
Activation.GeLUTanh: GeLUTanh,
Activation.LeakyReLU: nn.LeakyReLU,
Activation.SquaredReLU: SquaredReLU,
Activation.Tanh: nn.Tanh,
Activation.QuickGELU: QuickGELU,
}[activation]()
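# Minimal usage sketch for the registry above, assuming this module is importable as
# ``libai.layers`` (the MLP module below imports ``build_activation`` from there).
# The tensor sizes are illustrative only.
import oneflow as flow

from libai.layers import build_activation

act = build_activation("gelu")          # Activation is a str Enum, so the raw string
quick = build_activation("quick_gelu")  # value selects the matching entry
x = flow.randn(2, 4)
y = act(x)                              # nn.GELU(), elementwise, same shape as x
z = quick(x)                            # x * sigmoid(1.702 * x)
assert y.shape == x.shape and z.shape == x.shape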
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import math
from typing import Tuple
import oneflow as flow
from oneflow import nn
from .linear import Linear
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
class MultiheadAttention(nn.Module):
"""Multi-head attention layer, support self attention and cross attention.
Args:
hidden_size: size of hidden state.
num_attention_heads: number of attention heads.
is_cross_attention: used to specify whether it is self attention or cross attention.
Defaults to False.
attention_dropout_prob: dropout probability of attention weights.
Defaults to 0.0.
output_dropout_prob: dropout probability of output. Defaults to 0.0.
init_method: method to initialize the input layer weights.
Defaults to ``init.xavier_normal_``.
output_layer_init_method: method to initialize the output layer weights.
If None, use ``init_method``.
bias_dropout_fusion: whether to fuse add bias and dropout.
Defaults to False.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax.
Defaults to False.
apply_query_key_layer_scaling: if ``True``, scale the attention scores by the layer index.
Defaults to False.
layer_idx: the layer index, which determines the placement.
It is used in pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
num_attention_heads,
is_cross_attention=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
if output_layer_init_method is None:
output_layer_init_method = init_method
assert (
hidden_size % num_attention_heads == 0
), "hidden_size must be divisible by num_attention_heads."
self.num_heads = num_attention_heads
self.head_size = hidden_size // num_attention_heads
self.attn_mask_type = attn_mask_type
self.attention_dropout_prob = attention_dropout_prob
self.dropout = nn.Dropout(p=attention_dropout_prob)
self.norm_factor = 1.0 / math.sqrt(float(self.head_size))
self.coeff = None
if apply_query_key_layer_scaling:
self.coeff = layer_idx + 1
self.norm_factor /= self.coeff
self.is_cross_attention = is_cross_attention
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if self.bias_dropout_fusion:
self.output_dropout_prob = output_dropout_prob
else:
self.output_dropout = nn.Dropout(p=output_dropout_prob)
if self.is_cross_attention:
self.query = Linear(
self.hidden_size,
self.hidden_size,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.key_value = Linear(
self.hidden_size,
self.hidden_size * 2,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
else:
self.query_key_value = Linear(
self.hidden_size,
self.hidden_size * 3,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.dense = Linear(
self.hidden_size,
self.hidden_size,
parallel="row",
init_method=output_layer_init_method,
skip_bias_add=self.bias_dropout_fusion,
layer_idx=layer_idx,
)
def forward(
self,
hidden_states: flow.Tensor,
encoder_states: flow.Tensor = None,
attention_mask: flow.Tensor = None,
past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
use_cache: bool = False,
):
"""
Args:
hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
Defaults to None.
attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
It should be the combination of a padding mask and a causal mask.
For self-attention in the encoder, it is the padding mask of the source input.
For self-attention in the decoder, it is the combination of the padding mask of the
target input and a causal mask. For cross-attention in the decoder, it is the
padding mask of the source input.
Defaults to None.
past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
use_cache (bool, optional): set to True when the model is in the inference
phase and used for incremental decoding. Defaults to False.
"""
# hidden_states, encoder_states: [S(0), B]
# attention_mask: [S(0), B]
if encoder_states is not None:
encoder_states = encoder_states.to_global(placement=hidden_states.placement)
if attention_mask is not None:
attention_mask = attention_mask.to_global(placement=hidden_states.placement)
bsz, tgt_len = hidden_states.size()[:2]
if self.is_cross_attention:
# if it is cross attention, key and value should be calculated only once, and the
# result can be reused.
query = self.query(hidden_states)
query = query.view(bsz, -1, self.num_heads, self.head_size)
query = query.permute(0, 2, 1, 3)
if past_key_value is not None:
key, value = past_key_value
elif encoder_states is not None:
key_value = self.key_value(encoder_states)
key_value = key_value.view(bsz, -1, self.num_heads, 2 * self.head_size)
key_value = key_value.permute(0, 2, 1, 3)
key, value = flow.chunk(key_value, chunks=2, dim=-1)
else:
raise ValueError(
"past_key_value and encoder_states cannot be None at the same time."
)
else:
# if it is self attention, query, key, and value are all obtained from hidden_states.
# when in the inference phase of an incremental decoder,
# hidden_states is the last-added state,
# the full key and value could be obtained by concatenating with past_key_value.
query_key_value = self.query_key_value(hidden_states)
query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
query_key_value = query_key_value.permute(
0, 2, 1, 3
) # [bsz, num_heads, src_len, 3 * head_size]
query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
if past_key_value is not None:
past_key, past_value = past_key_value
key = flow.cat((past_key.type_as(key), key), dim=2)
value = flow.cat((past_value.type_as(value), value), dim=2)
# query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size]
if use_cache:
past_key_value = (key, value)
# [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)]
attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor)
# [S(0), S(1)] x [S(0), B] = [S(0), S(1)]
if attention_mask is not None:
if self.scale_mask_softmax_fusion:
if self.attn_mask_type == AttnMaskType.padding:
attention_mask = (
attention_mask.expand_as(attention_scores) if use_cache else attention_mask
)
attention_weights = flow._C.fused_scale_mask_softmax_dropout(
attention_scores,
attention_mask,
fill_value=-10000.0,
scale=self.coeff,
p=self.attention_dropout_prob,
)[0]
else:
if self.coeff is not None:
attention_scores *= self.coeff
attention_scores = flow.mul(attention_scores, attention_mask)
attention_scores = attention_scores - 10000.0 * (1 - attention_mask)
# TODO(xingyu.liao): graph will occur `where_scalar` errors
# when using `masked_fill`
# attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0)
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
else:
if self.scale_mask_softmax_fusion and self.attn_mask_type == AttnMaskType.causal:
attention_weights = flow._C.fused_scale_tril_softmax_mask_scale(
attention_scores,
p=self.attention_dropout_prob,
diagonal=0,
tril_scale_value=self.coeff,
tril_fill_value=-10000.0,
)[0]
else:
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
# Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)]
context = flow.matmul(attention_weights, value)
# Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size]
context = context.transpose(1, 2)
# Concat multi-head results from
# [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size]
# SBP sign: [S(0), S(2)]
# [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B]
output = self.dense(context.flatten(2))
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.output_dropout(output)
if use_cache:
output = (output, past_key_value)
return output
def extra_repr(self) -> str:
return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
self.hidden_size,
self.num_heads,
self.is_cross_attention,
)
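# A local sketch of the score/softmax/context math in MultiheadAttention.forward above,
# written with plain (non-global) tensors so it runs without any parallel placement.
# bsz / num_heads / tgt_len / head_size are illustrative values.
import math
import oneflow as flow

bsz, num_heads, tgt_len, head_size = 2, 4, 8, 16
query = flow.randn(bsz, num_heads, tgt_len, head_size)
key = flow.randn(bsz, num_heads, tgt_len, head_size)
value = flow.randn(bsz, num_heads, tgt_len, head_size)

norm_factor = 1.0 / math.sqrt(head_size)
# [bsz, num_heads, tgt_len, tgt_len], same op as in the module above
scores = flow.matmul(query, key, transpose_b=True, alpha=norm_factor)
weights = flow.softmax(scores, dim=-1)
context = flow.matmul(weights, value)          # [bsz, num_heads, tgt_len, head_size]
output = context.transpose(1, 2).flatten(2)    # [bsz, tgt_len, num_heads * head_size]
assert tuple(output.shape) == (bsz, tgt_len, num_heads * head_size)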
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
class ParallelCrossEntropyLoss(nn.Module):
"""This criterion acts like :class:`~flow.nn.CrossEntropyLoss` except it will
execute the cross entropy loss computation distributed across different GPUs.
"""
def forward(self, logits: flow.Tensor, target: flow.Tensor):
"""Function for the distributed cross entropy.
Args:
logits (flow.Tensor): vocab_parallel_logits with shape
(batch_size, seq_length, vocab_size) and sbp signature is [S(0), S(2)].
target (flow.Tensor): target with shape (batch_size, seq_length) and
sbp signature is [S(0), B].
"""
assert logits.ndim == 3
assert target.ndim == 2
assert logits.shape[0:2] == target.shape
target = target.to_global(placement=logits.placement)
# Change -1 in target to 0 because sparse_softmax_cross_entropy doesn't accept -1
target = target * (target >= 0)
lm_loss = flow._C.sparse_softmax_cross_entropy(
logits.view(-1, logits.shape[-1]),
target.view(-1),
)
return lm_loss
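# A small local illustration of the ``target * (target >= 0)`` remapping above: positions
# marked with -1 (e.g. padding) are mapped to class 0 so the kernel accepts them; their
# loss contribution is typically masked out afterwards by the caller.
import oneflow as flow

target = flow.tensor([[5, 2, -1, -1], [7, -1, -1, -1]])
remapped = target * (target >= 0)   # -> [[5, 2, 0, 0], [7, 0, 0, 0]]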
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
def drop_path(x, drop_prob: float = 0.5, training: bool = False, scale_by_keep: bool = True):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
if drop_prob == 0.0 or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
# similar operation to new_tensor(shape).bernoulli_(keep_prob)
random_tensor = flow.rand(*shape, dtype=x.dtype, sbp=x.sbp, placement=x.placement)
random_tensor = (random_tensor < keep_prob).to(flow.float32)
if keep_prob > 0.0 and scale_by_keep:
random_tensor = random_tensor / keep_prob
return x * random_tensor
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
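# A quick local check of the stochastic-depth behaviour above, rewritten for plain local
# tensors (no sbp/placement arguments to ``flow.rand``). ``drop_path_local`` is only an
# illustrative helper, not part of this module.
import oneflow as flow

def drop_path_local(x, drop_prob=0.2, training=True, scale_by_keep=True):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)       # one mask value per sample
    mask = (flow.rand(*shape) < keep_prob).to(x.dtype)
    if scale_by_keep:
        mask = mask / keep_prob                        # keeps the expected value unchanged
    return x * mask

x = flow.ones(4, 3, 8)
y = drop_path_local(x)   # each sample is either zeroed or rescaled by 1 / keep_prob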
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.utils import distributed as dist
class Embedding(nn.Module):
"""Construct the trainable embedding module, which does not support parallelization.
This can be used for positional embedding and token type embedding.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
layer_idx=0,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
assert num_embeddings > 0
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.init_method(self.weight)
# FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# embeddings with sbp sign: [B, B]
# [B, B] x [S(0), B] --> [S(0), B]
# ↑ ↑ ↑
# embed pos_ids pos_embed
input_embeds = flow._C.gather(weight, input_ids, axis=0)
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class VocabEmbedding(nn.Module):
"""Construct the word embeddings, which may be split along vocabulary dimension.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
# Word token embedding shape with (vocab_size, hidden_size)
# sbp: [B, S(0)]
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
# Initialize the word embedding
self.init_method(self.weight)
# FIXME(Lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# input_ids with shape (batch_size, seq_len), and sbp sign: [S(0), B]
# Gather forward sbp sign
# [B, S(0)] x [S(0), B] --> [S(0), P]
# ↑ ↑ ↑
# embed input_ids input_embeds
input_embeds = flow._C.gather(weight, input_ids, axis=0)
# Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results.
input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp())
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class SinePositionalEmbedding(nn.Module):
"""Construct the sinusoidal positional embeddings.
Arguments:
num_embeddings: maximum number of positions to embed.
embedding_dim: dimension of embeddings.
"""
def __init__(self, num_embeddings, embedding_dim):
super().__init__()
self.embedding_dim = embedding_dim
self.num_embeddings = num_embeddings
position_embedding = flow.zeros(
num_embeddings,
embedding_dim,
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
position = flow._C.global_arange(
start=0,
end=num_embeddings,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
).unsqueeze(1)
position_range = flow._C.global_arange(
start=0,
end=embedding_dim,
step=2,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
)
div_term = flow.exp(position_range * (-math.log(10000.0) / embedding_dim))
position_embedding[:, 0::2] = flow.sin(position * div_term)
position_embedding[:, 1::2] = flow.cos(position * div_term)
self.register_buffer("position_embedding", position_embedding)
def forward(self, position_ids):
position_embeds = flow._C.gather(self.position_embedding, position_ids, axis=0)
return position_embeds
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
return s.format(**self.__dict__)
class PatchEmbedding(nn.Module):
"""2D Image to Patch Embedding
Arguments:
img_size: size of the input image. Defaults to 224.
patch_size: size of each embedded patch. Defaults to 16.
in_chans: number of input channels. Defaults to 3.
embed_dim: dimension of the embedded patch. Defaults to 768.
norm_layer: normalization layer applied to the patch embedding, or None to skip it.
Defaults to None.
flatten: whether to flatten the patch embedding or keep the 2-D shape. Defaults to True.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
*,
layer_idx=0,
):
super().__init__()
img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
patch_size = patch_size if isinstance(patch_size, tuple) else (patch_size, patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
).to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
B, C, H, W = x.shape
assert (
H == self.img_size[0]
), f"Input image height ({H}) doesn't match model ({self.img_size[0]})."
assert (
W == self.img_size[1]
), f"Input image width ({W}) doesn't match model ({self.img_size[1]})."
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
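# A local re-derivation of the sinusoidal table built in SinePositionalEmbedding above,
# using plain tensors instead of global ones (no placement/sbp). The sizes are
# illustrative; the formula matches the module's sin/cos construction.
import math
import oneflow as flow

num_embeddings, embedding_dim = 16, 8
pe = flow.zeros(num_embeddings, embedding_dim)
position = flow.arange(0, num_embeddings, dtype=flow.float32).unsqueeze(1)
div_term = flow.exp(
    flow.arange(0, embedding_dim, 2, dtype=flow.float32) * (-math.log(10000.0) / embedding_dim)
)
pe[:, 0::2] = flow.sin(position * div_term)   # even dimensions
pe[:, 1::2] = flow.cos(position * div_term)   # odd dimensions
# Position ids can then be gathered row-wise, as in SinePositionalEmbedding.forward.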
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LayerNorm(nn.Module):
"""Applies Layer Normalization over a mini-batch of inputs in 1D parallelism.
Args:
normalized_shape: input shape from an expected input of size.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self, normalized_shape, eps=1e-5, elementwise_affine=True, bias=True, *, layer_idx=0
):
super().__init__()
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.layer_idx = layer_idx
if elementwise_affine:
self.weight = nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.bias = nn.Parameter(
flow.zeros(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
requires_grad=bias,
)
else:
self.weight = None
self.bias = None
def forward(self, x):
assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
begin_norm_axis = x.ndim - len(self.normalized_shape)
begin_params_axis = x.ndim - len(self.normalized_shape)
if self.elementwise_affine:
y = flow._C.layer_norm_affine(
x,
self.weight,
self.bias,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
else:
y = flow._C.layer_norm(
x,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
return y
def extra_repr(self) -> str:
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
**self.__dict__
)
class RMSLayerNorm(nn.Module):
"""T5 uses a layer_norm which only scales and doesn't shift, which is also known as
Root Mean Square Layer Normalization thus varience is calculated w/o mean and
there is no bias. More details see: https://arxiv.org/abs/1910.07467.
Args:
normalized_shape: input shape from an expected input of size.
eps: a value added to the denominator for numerical stability. Defaults to 1e-6.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(self, normalized_shape, eps=1e-6, layer_idx=0):
super().__init__()
self.layer_idx = layer_idx
self.weight = flow.nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.l2norm_epsilon = eps
def forward(self, hidden_states):
return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon)
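# A reference (unfused) sketch of the normalization RMSLayerNorm delegates to
# ``flow._C.rms_layer_norm`` above, following the docstring: divide by the root mean
# square of the last dimension, with no mean subtraction and no bias. Written with
# plain local tensors; ``rms_norm_reference`` is an illustrative helper.
import oneflow as flow

def rms_norm_reference(x, weight, eps=1e-6):
    mean_square = x.pow(2).mean(dim=-1, keepdim=True)   # uncentered second moment
    return weight * x / flow.sqrt(mean_square + eps)

hidden = flow.randn(2, 5, 16)
weight = flow.ones(16)
out = rms_norm_reference(hidden, weight)   # same shape as hidden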
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Linear1D(nn.Module):
r"""Linear layer with 1D parallelism which includes column parallelism and row parallelism.
The linear layer is defined as :math:`y = xA^T + b`.
In column parallelism, A^T is parallelized along the second dimension
as :math:`A^T = [A_1, ..., A_p]`.
In row parallelism, A^T is parallelized along the first dimension and X along its second
dimension as:
.. math::
A^T = \begin{bmatrix}
A\_1 \\
. \\
. \\
. \\
A\_p
\end{bmatrix}
x = \begin{bmatrix}
x\_1 & ... & x\_p
\end{bmatrix}
Arguments:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
parallel: Parallel mode. Defaults to "data".
init_method: method to initialize weight. Defaults to :func:`nn.init.xavier_normal_`.
skip_bias_add: skip adding bias but instead return it, so that adding bias can be fused with
other elementwise operations. Defaults to ``False``.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
*,
layer_idx=0, # enforce layer_idx passed with keyword
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
# Column parallel
# weight sbp sign: [B, S(0)], weight will be transposed when performing matmul,
# so the effective weight sbp sign is [B, S(1)]
# bias sbp sign: [B, S(0)]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "row":
# Row parallel
# weight sbp sign: [B, S(1)], weight will be transposed when performing matmul,
# so the effective weight sbp sign is [B, S(0)]
# bias sbp sign: [B, B]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
raise KeyError(f"{parallel} is not supported! Only 'data', 'row' and 'col' are supported.")
self.weight = flow.nn.Parameter(
flow.empty(
(out_features, in_features),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])):
# If the last dim of weight sbp sign is S(0), then last dim of weight.t sbp
# sign is S(1), so the last dim of x sbp sign must be B.
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight, transpose_b=True)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
):
# If the last dim of weight sbp sign is S(1), then last dim of weight.t sbp
# sign is S(0), so the last dim of x sbp sign must be S(ndim-1).
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight, transpose_b=True)
# Change x.sbp for followup forward pass.
# This line can be removed when sbp can be auto inferred.
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
# NOTE(chengcheng): when input x is [S(0), B], there is no need to change sbp for x.
# x = x.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(0)]))
x = flow.matmul(x, self.weight, transpose_b=True)
else:
# Not supported weight_sbp, deduce sbp and communicate with nccl automatically.
x = flow.matmul(x, self.weight, transpose_b=True)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
# Alias for Linear1D
Linear = Linear1D
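# A toy, single-process illustration of the column/row decomposition documented above:
# splitting the weight by output features ("col" parallel) concatenates partial results,
# while splitting by input features ("row" parallel) produces partial sums that must be
# reduced. All tensors here are plain local tensors; the sizes are illustrative.
import oneflow as flow

x = flow.randn(4, 6)     # [batch, in_features]
w = flow.randn(8, 6)     # [out_features, in_features], the layout stored by Linear1D

full = flow.matmul(x, w, transpose_b=True)                 # [4, 8]

# "col" parallel: split out_features across two ranks, then concatenate.
w0, w1 = flow.chunk(w, chunks=2, dim=0)
col = flow.cat(
    [flow.matmul(x, w0, transpose_b=True), flow.matmul(x, w1, transpose_b=True)], dim=1
)

# "row" parallel: split in_features (and x accordingly), then sum the partial outputs.
wa, wb = flow.chunk(w, chunks=2, dim=1)
xa, xb = flow.chunk(x, chunks=2, dim=1)
row = flow.matmul(xa, wa, transpose_b=True) + flow.matmul(xb, wb, transpose_b=True)

assert (col - full).abs().max() < 1e-4
assert (row - full).abs().max() < 1e-4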
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LMLogits(nn.Module):
def __init__(self, vocab_size, bias=False):
super().__init__()
self.bias = (
nn.Parameter(
flow.zeros(
(vocab_size,),
dtype=flow.float32,
placement=dist.get_layer_placement(-1),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
if bias
else None
)
def forward(self, input, word_embeddings):
"""LM logits using word embedding weights"""
# input with sbp sign [S(0), B] and word_embeddings with sbp sign [S(0), B]
# NOTE(l1aoxingyu): This is for pipeline parallelism
# change word embedding placement from stage(0) to stage(-1)
w = word_embeddings.to_global(placement=input.placement)
# NOTE(l1aoxingyu): input x embed^T = logits with sbp sign
# [S(0), B] x [B, S(1)] --> [S(0), S(1)]
# ↑ ↑ ↑
# input embed^T logits
# Backward pass input.grad = logits.grad x embed with sbp sign
# [S(0), S(1)] x [B, S(0)] --> [S(0), P]
# ↑ ↑ ↑
# logits.grad embed input.grad
# When using input.grad as the head node for the backward pass, its sbp sign needs
# to be converted from [S(0), P] --> [S(0), B]
input = input.to_global(grad_sbp=input.sbp)
logits = flow._C.matmul(input, w, transpose_b=True)
if self.bias is not None:
logits = logits + self.bias
return logits
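# A local sketch of the tied-embedding logits computed above: the hidden states are
# multiplied by the transposed word-embedding matrix, giving one score per vocabulary id.
# Sizes are illustrative; no sbp/placement handling is shown here.
import oneflow as flow

vocab_size, hidden_size = 100, 16
word_embeddings = flow.randn(vocab_size, hidden_size)
hidden_states = flow.randn(2, 7, hidden_size)     # [bsz, seq_len, hidden_size]

logits = flow.matmul(hidden_states, word_embeddings, transpose_b=True)
assert tuple(logits.shape) == (2, 7, vocab_size)  # [bsz, seq_len, vocab_size]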
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.layers import Linear, build_activation
class MLP(nn.Module):
"""MLP
The MLP takes an input of hidden size h, projects it to the intermediate
hidden dimension, applies a GeLU transformation, and projects the
state back to hidden size h.
Arguments:
hidden_size: size of each input and output sample.
ffn_hidden_size: size of each intermediate sample.
output_dropout_prob: Output dropout probability. Defaults to 0.0.
init_method: method to initialize the first linear weight.
Defaults to :func:`nn.init.xavier_normal_`.
output_layer_init_method: method to initialize the second linear weight. If set to None,
it will use ``init_method`` instead. Defaults to None.
bias_gelu_fusion: If set to ``True``, it will fuse bias adding and elementwise
gelu activation. Defaults to ``False``.
bias_dropout_fusion: If set to ``True``, it will fuse bias adding and dropout.
Defaults to ``False``.
layer_idx: the layer index, which determines the placement. It is used in
pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
*,
layer_idx=0,
):
super().__init__()
self.output_dropout_prob = output_dropout_prob
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if output_layer_init_method is None:
output_layer_init_method = init_method
self.dense_h_to_4h = Linear(
hidden_size,
ffn_hidden_size,
bias=True,
parallel="col",
skip_bias_add=bias_gelu_fusion,
init_method=init_method,
layer_idx=layer_idx,
)
if not bias_gelu_fusion:
self.activation_func = build_activation("gelu")
self.dense_4h_to_h = Linear(
ffn_hidden_size,
hidden_size,
bias=True,
parallel="row",
skip_bias_add=bias_dropout_fusion,
init_method=output_layer_init_method,
layer_idx=layer_idx,
)
if not bias_dropout_fusion:
self.dropout = nn.Dropout(self.output_dropout_prob)
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate, bias = intermediate
intermediate = flow._C.fused_bias_add_gelu(
intermediate, bias, axis=intermediate.ndim - 1
)
else:
intermediate = self.activation_func(intermediate)
output = self.dense_4h_to_h(intermediate)
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.dropout(output)
return output
def extra_repr(self) -> str:
return "bias_gelu_fusion={}, bias_dropout_fusion={}, dropout={}".format(
self.bias_gelu_fusion, self.bias_dropout_fusion, self.output_dropout_prob
)
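# The dataflow of MLP above restated with plain oneflow modules (no parallelism and no
# fused kernels) to make the h -> ffn_hidden -> h shape contract explicit. Purely a
# local reference; it does not reproduce the sbp/placement behaviour of Linear.
import oneflow as flow
from oneflow import nn

hidden_size, ffn_hidden_size = 16, 64
reference_mlp = nn.Sequential(
    nn.Linear(hidden_size, ffn_hidden_size),
    nn.GELU(),
    nn.Linear(ffn_hidden_size, hidden_size),
    nn.Dropout(p=0.0),
)
x = flow.randn(2, 5, hidden_size)
y = reference_mlp(x)
assert y.shape == x.shape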
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow.nn as nn
from libai.utils import distributed as dist
from .attention import AttnMaskType, MultiheadAttention
from .droppath import DropPath
from .layer_norm import LayerNorm
from .mlp import MLP
class TransformerLayer(nn.Module):
"""A single transformer layer.
The transformer layer takes an input of size [bsz, seq_length, hidden_size] and returns an
output of the same size.
The input and output have the same sbp sign, (S(0), B).
Arguments:
hidden_size: size of hidden state.
ffn_hidden_size: size of the feed-forward neural network.
num_attention_heads: number of attention heads.
is_decoder: used to specify whether this is transformer encoder layer or transformer
decoder layer. Default: ``False``.
attention_dropout_prob: dropout probability of attention weights.
output_dropout_prob: dropout probability of output.
layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
init_method: method to initialize the input layer weights.
output_layer_init_method: method to initialize the output layer weights.
If None, use `init_method`.
bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
apply_query_key_layer_scaling: if ``True``, scale the attention scores by the layer index.
Default: ``False``.
apply_residual_post_layernorm: if ``True``, use the original BERT residual
connection ordering. Otherwise, use the Megatron-style BERT residual connection
introduced in https://arxiv.org/pdf/1909.08053.pdf, which is more stable when
scaling up the model size.
Default: ``False``.
layer_idx: the layer index, which determines the placement.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
num_attention_heads,
is_decoder=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
drop_path_prob=0.0,
layernorm_epsilon=1e-5,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.num_attention_heads = num_attention_heads
self.attention_dropout_prob = attention_dropout_prob
self.output_dropout_prob = output_dropout_prob
self.layernorm_epsilon = layernorm_epsilon
self.attn_mask_type = attn_mask_type
self.layer_idx = layer_idx
self.is_decoder = is_decoder
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.apply_residual_post_layernorm = apply_residual_post_layernorm
self.init_method = init_method
if output_layer_init_method is None:
output_layer_init_method = init_method
self.output_layer_init_method = output_layer_init_method
self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()
self.input_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.self_attention = self.build_attention(is_cross_attention=False)
self.post_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
if self.is_decoder:
self.cross_attention = self.build_attention(is_cross_attention=True)
self.post_cross_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.mlp = MLP(
self.hidden_size,
self.ffn_hidden_size,
self.output_dropout_prob,
self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_gelu_fusion=self.bias_gelu_fusion,
bias_dropout_fusion=self.bias_dropout_fusion,
layer_idx=self.layer_idx,
)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_states=None,
encoder_attention_mask=None,
past_key_value=None,
use_cache=False,
):
"""
Args:
hidden_states: shape is (batch_size, seq_length, hidden_size),
sbp signature is (S(0), B).
attention_mask: the combination of the key padding mask and the causal mask of hidden
states, with shape (batch_size, 1, seq_length, seq_length) and sbp
signature (S(0), B).
encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
and the sbp signature is (S(0), B), which will be used in cross attention.
encoder_attention_mask: key padding mask of encoder states with shape
(batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
past_key_value: tuple of key and value, each with shape
(bsz, num_heads, seq_length, head_size). For a decoder layer,
past_key_value contains the states from both self-attention
and cross-attention.
use_cache: set to ``True`` when the model is in the inference phase and
used for incremental decoding.
"""
# Change placement for pipeline parallelism
hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
# hidden_states shape: (batch_size, seq_length, hidden_size)
if attention_mask is not None:
attention_mask = attention_mask.to_global(
placement=dist.get_layer_placement(self.layer_idx)
)
if past_key_value is not None:
if self.is_decoder:
assert len(past_key_value) == 4
self_attn_past_key_value = past_key_value[:2]
cross_attn_past_key_value = past_key_value[2:]
else:
self_attn_past_key_value = past_key_value
cross_attn_past_key_value = None
else:
self_attn_past_key_value, cross_attn_past_key_value = None, None
layernorm_output = self.input_layernorm(hidden_states)
attention_output = self.self_attention(
layernorm_output,
attention_mask=attention_mask,
past_key_value=self_attn_past_key_value,
use_cache=use_cache,
)
attention_output = self.drop_path(attention_output)
if use_cache:
attention_output, presents = attention_output
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_attention_layernorm(hidden_states)
if self.is_decoder:
attention_output = self.cross_attention(
layernorm_output,
encoder_states,
attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
use_cache=use_cache,
)
if use_cache:
attention_output, decoder_presents = attention_output
presents += decoder_presents
attention_output = self.drop_path(attention_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_cross_attention_layernorm(hidden_states)
mlp_output = self.mlp(layernorm_output)
mlp_output = self.drop_path(mlp_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
output = residual + mlp_output
if use_cache:
output = (output, presents)
return output
def build_attention(self, is_cross_attention=False):
return MultiheadAttention(
self.hidden_size,
self.num_attention_heads,
is_cross_attention=is_cross_attention,
attention_dropout_prob=self.attention_dropout_prob,
output_dropout_prob=self.output_dropout_prob,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_dropout_fusion=self.bias_dropout_fusion,
scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
attn_mask_type=self.attn_mask_type,
layer_idx=self.layer_idx,
)
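# A condensed, local restatement of the pre-LayerNorm residual pattern implemented by
# TransformerLayer.forward above (encoder path, no caching, masks, drop-path, or
# parallel placement). The nn.Identity() modules stand in for self_attention and mlp.
import oneflow as flow
from oneflow import nn

hidden_size = 16
input_layernorm = nn.LayerNorm(hidden_size)
post_attention_layernorm = nn.LayerNorm(hidden_size)
self_attention = nn.Identity()   # stand-in for MultiheadAttention
mlp = nn.Identity()              # stand-in for MLP

hidden_states = flow.randn(2, 5, hidden_size)
attention_output = self_attention(input_layernorm(hidden_states))
hidden_states = hidden_states + attention_output             # residual around attention
mlp_output = mlp(post_attention_layernorm(hidden_states))
output = hidden_states + mlp_output                           # residual around the MLP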