Commit e4c07faf authored by Victor SANH

add sparsity modules

parent 667003e4
from .modules import *

from .configuration_bert_masked import MaskedBertConfig
from .modeling_bert_masked import (
    MaskedBertModel,
    MaskedBertForQuestionAnswering,
    MaskedBertForSequenceClassification,
    MaskedBertForTokenClassification,
    MaskedBertForMultipleChoice,
)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Masked BERT model configuration. It replicates the class `~transformers.BertConfig`
and adapts it to the specificities of MaskedBert (`pruning_method`, `mask_init` and `mask_scale`."""
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
logger = logging.getLogger(__name__)
class MaskedBertConfig(PretrainedConfig):
    """
    A class replicating the `~transformers.BertConfig` with additional parameters for pruning/masking configuration.
    """

    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "masked_bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        pruning_method="topK",
        mask_init="constant",
        mask_scale=0.0,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.pruning_method = pruning_method
        self.mask_init = mask_init
        self.mask_scale = mask_scale
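# Illustrative usage sketch (not part of the original commit); `_demo_masked_bert_config`
# is a hypothetical helper, and the non-default values below are arbitrary examples.
def _demo_masked_bert_config():
    config = MaskedBertConfig(num_hidden_layers=6, pruning_method="topK", mask_init="constant", mask_scale=0.0)
    return config.pruning_method, config.mask_init, config.mask_scale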
from .binarizer import ThresholdBinarizer, TopKBinarizer, MagnitudeBinarizer
from .masked_nn import MaskedLinear
# coding=utf-8
# Copyright 2020-present, AllenAI Authors, University of Illinois Urbana-Champaign,
# Intel Nervana Systems and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Binarizers take a (real value) matrice as input and produce a binary (values in {0,1}) mask of the same shape.
"""
import torch
from torch import autograd
class ThresholdBinarizer(autograd.Function):
    """
    Threshold binarizer.
    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j} > \tau`
    where `\tau` is a real value threshold.

    Implementation is inspired from:
        https://github.com/arunmallya/piggyback
        Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights
        Arun Mallya, Dillon Davis, Svetlana Lazebnik
    """

    @staticmethod
    def forward(ctx, inputs: torch.Tensor, threshold: float, sigmoid: bool):
        """
        Args:
            inputs (`torch.FloatTensor`)
                The input matrix from which the binarizer computes the binary mask.
            threshold (`float`)
                The threshold value (in R).
            sigmoid (`bool`)
                If set to ``True``, we apply the sigmoid function to the `inputs` matrix before comparing to `threshold`.
                In this case, `threshold` should be a value between 0 and 1.
        Returns:
            mask (`torch.FloatTensor`)
                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
                retained, 0 - the associated weight is pruned).
        """
        nb_elems = inputs.numel()
        nb_min = int(0.005 * nb_elems) + 1
        if sigmoid:
            mask = (torch.sigmoid(inputs) > threshold).type(inputs.type())
        else:
            mask = (inputs > threshold).type(inputs.type())
        if mask.sum() < nb_min:
            # We limit the pruning so that at least 0.5% (half a percent) of the weights are remaining
            k_threshold = inputs.flatten().kthvalue(max(nb_elems - nb_min, 1)).values
            mask = (inputs > k_threshold).type(inputs.type())
        return mask

    @staticmethod
    def backward(ctx, gradOutput):
        # Straight-through estimator: the binarization step is treated as the identity in the backward pass.
        return gradOutput, None, None
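# Illustrative usage sketch (not part of the original commit); `_demo_threshold_binarizer`
# is a hypothetical helper added only for documentation purposes.
def _demo_threshold_binarizer():
    scores = torch.randn(4, 4, requires_grad=True)
    # Keep entries whose sigmoid-score exceeds 0.5; gradients flow straight through the binarization.
    return ThresholdBinarizer.apply(scores, 0.5, True)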
class TopKBinarizer(autograd.Function):
    """
    Top-k Binarizer.
    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
    is among the k% highest values of S.

    Implementation is inspired from:
        https://github.com/allenai/hidden-networks
        What's hidden in a randomly weighted neural network?
        Vivek Ramanujan*, Mitchell Wortsman*, Aniruddha Kembhavi, Ali Farhadi, Mohammad Rastegari
    """

    @staticmethod
    def forward(ctx, inputs: torch.Tensor, threshold: float):
        """
        Args:
            inputs (`torch.FloatTensor`)
                The input matrix from which the binarizer computes the binary mask.
            threshold (`float`)
                The percentage of weights to keep (the rest is pruned).
                `threshold` is a float between 0 and 1.
        Returns:
            mask (`torch.FloatTensor`)
                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
                retained, 0 - the associated weight is pruned).
        """
        # Get the subnetwork by sorting the inputs and using the top threshold %
        mask = inputs.clone()
        _, idx = inputs.flatten().sort(descending=True)
        j = int(threshold * inputs.numel())

        # flat_out and mask access the same memory.
        flat_out = mask.flatten()
        flat_out[idx[j:]] = 0
        flat_out[idx[:j]] = 1
        return mask

    @staticmethod
    def backward(ctx, gradOutput):
        return gradOutput, None
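# Illustrative usage sketch (not part of the original commit); `_demo_topk_binarizer`
# is a hypothetical helper keeping the top 25% of scores.
def _demo_topk_binarizer():
    scores = torch.randn(8, 8, requires_grad=True)
    # Roughly a quarter of the entries of the returned mask are 1, the rest are 0.
    return TopKBinarizer.apply(scores, 0.25)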
class MagnitudeBinarizer(object):
    """
    Magnitude Binarizer.
    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
    is among the k% highest values of |S| (absolute value).

    Implementation is inspired from https://github.com/NervanaSystems/distiller/blob/2291fdcc2ea642a98d4e20629acb5a9e2e04b4e6/distiller/pruning/automated_gradual_pruner.py#L24
    """

    @staticmethod
    def apply(inputs: torch.Tensor, threshold: float):
        """
        Args:
            inputs (`torch.FloatTensor`)
                The input matrix from which the binarizer computes the binary mask.
                This input matrix is typically the weight matrix.
            threshold (`float`)
                The percentage of weights to keep (the rest is pruned).
                `threshold` is a float between 0 and 1.
        Returns:
            mask (`torch.FloatTensor`)
                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
                retained, 0 - the associated weight is pruned).
        """
        # Get the subnetwork by sorting the inputs and using the top threshold %
        mask = inputs.clone()
        _, idx = inputs.abs().flatten().sort(descending=True)
        j = int(threshold * inputs.numel())

        # flat_out and mask access the same memory.
        flat_out = mask.flatten()
        flat_out[idx[j:]] = 0
        flat_out[idx[:j]] = 1
        return mask
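# Illustrative usage sketch (not part of the original commit); `_demo_magnitude_binarizer`
# is a hypothetical helper keeping the 25% largest weights by absolute value.
def _demo_magnitude_binarizer():
    weight = torch.randn(8, 8)
    mask = MagnitudeBinarizer.apply(weight, 0.25)
    return mask * weight  # pruned weight matrix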
# coding=utf-8
# Copyright 2020-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Masked Linear module: A fully connected layer that computes an adaptive binary mask on the fly.
The mask (binary or not) is computed at each forward pass and multiplied against
the weight matrix to prune a portion of the weights.
The pruned weight matrix is then multiplied against the inputs (and if necessary, the bias is added).
"""
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import init
import math
from .binarizer import ThresholdBinarizer, TopKBinarizer, MagnitudeBinarizer
class MaskedLinear(nn.Linear):
    """
    Fully Connected layer with on the fly adaptive mask.
    If needed, a score matrix is created to store the importance of each associated weight.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        mask_init: str = "constant",
        mask_scale: float = 0.0,
        pruning_method: str = "topK",
    ):
        """
        Args:
            in_features (`int`)
                Size of each input sample
            out_features (`int`)
                Size of each output sample
            bias (`bool`)
                If set to ``False``, the layer will not learn an additive bias.
                Default: ``True``
            mask_init (`str`)
                The initialization method for the score matrix if a score matrix is needed.
                Choices: ["constant", "uniform", "kaiming"]
                Default: ``constant``
            mask_scale (`float`)
                The initialization parameter for the chosen initialization method `mask_init`.
                Default: ``0.``
            pruning_method (`str`)
                Method to compute the mask.
                Choices: ["topK", "threshold", "sigmoied_threshold", "magnitude", "l0"]
                Default: ``topK``
        """
        super(MaskedLinear, self).__init__(in_features=in_features, out_features=out_features, bias=bias)
        assert pruning_method in ["topK", "threshold", "sigmoied_threshold", "magnitude", "l0"]
        self.pruning_method = pruning_method

        if self.pruning_method in ["topK", "threshold", "sigmoied_threshold", "l0"]:
            self.mask_scale = mask_scale
            self.mask_init = mask_init
            self.mask_scores = nn.Parameter(torch.Tensor(self.weight.size()))
            self.init_mask()

    def init_mask(self):
        if self.mask_init == "constant":
            init.constant_(self.mask_scores, val=self.mask_scale)
        elif self.mask_init == "uniform":
            init.uniform_(self.mask_scores, a=-self.mask_scale, b=self.mask_scale)
        elif self.mask_init == "kaiming":
            init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5))

    def forward(self, input: torch.Tensor, threshold: float):
        # Get the mask
        if self.pruning_method == "topK":
            mask = TopKBinarizer.apply(self.mask_scores, threshold)
        elif self.pruning_method in ["threshold", "sigmoied_threshold"]:
            sig = "sigmoied" in self.pruning_method
            mask = ThresholdBinarizer.apply(self.mask_scores, threshold, sig)
        elif self.pruning_method == "magnitude":
            mask = MagnitudeBinarizer.apply(self.weight, threshold)
        elif self.pruning_method == "l0":
            # Hard concrete distribution (L0 regularization): sample a stochastic gate during
            # training and use the deterministic sigmoid of the scores at inference.
            l, r, b = -0.1, 1.1, 2 / 3
            if self.training:
                u = torch.zeros_like(self.mask_scores).uniform_().clamp(0.0001, 0.9999)
                s = torch.sigmoid((u.log() - (1 - u).log() + self.mask_scores) / b)
            else:
                s = torch.sigmoid(self.mask_scores)
            s_bar = s * (r - l) + l
            mask = s_bar.clamp(min=0.0, max=1.0)
        # Mask weights with computed mask
        weight_thresholded = mask * self.weight
        # Compute output (linear layer) with masked weights
        return F.linear(input, weight_thresholded, self.bias)
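# Illustrative usage sketch (not part of the original commit); `_demo_masked_linear`
# is a hypothetical helper showing a forward pass where the top-K binarizer keeps 10% of the weights.
def _demo_masked_linear():
    layer = MaskedLinear(in_features=768, out_features=768, pruning_method="topK")
    x = torch.randn(2, 16, 768)
    # `threshold` is the fraction of weights kept when `pruning_method="topK"`.
    return layer(x, threshold=0.1)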