# simcse_sup.py

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow as flow
from oneflow import nn

from libai.utils import distributed as dist
from projects.SimCSE.modeling.model_utils import MLPLayer, cosine_similarity
from projects.SimCSE.utils.load_huggingface_weight import load_huggingface_bert

from .bert_for_simcse import BertForSimCSE


class Simcse_sup(nn.Module):
    """Sentence-embedding model trained contrastively on sentence triplets.

    The module chains a ``BertForSimCSE`` encoder, a pooling step selected by
    ``cfg.pooler_type`` and an ``MLPLayer`` head.  In training mode ``forward``
    returns a cross-entropy loss over temperature-scaled cosine similarities;
    in eval mode it returns the cosine similarity of sentence pairs.

    Args:
        cfg: configuration object.  The attributes/keys read here are
            ``pooler_type``, ``pretrained_model_weight``, ``hidden_size``,
            ``num_attention_heads`` and ``hidden_layers``; ``cfg`` is also
            passed whole to ``BertForSimCSE`` and ``MLPLayer``.
    """

    # Temperature the similarity logits are divided by before the loss.
    _TEMPERATURE = 0.05
    # Subtracted on the diagonal so a row can never pick itself as its target.
    _SELF_SIM_PENALTY = 1e12
    # Values of ``pooler_type`` that ``pooler`` knows how to handle.
    _POOLER_TYPES = ("cls", "pooled", "last-avg", "first-last-avg")

    def __init__(self, cfg):
        super().__init__()
        self.bert = BertForSimCSE(cfg)
        self.mlp = MLPLayer(cfg)
        self.pooler_type = cfg.pooler_type

        # Optionally initialise the encoder from a HuggingFace BERT checkpoint.
        if cfg.pretrained_model_weight is not None:
            load_huggingface_bert(
                self.bert,
                cfg.pretrained_model_weight,
                cfg["hidden_size"],
                cfg["num_attention_heads"],
                cfg["hidden_layers"],
            )

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Sum ``hidden`` over dim 1 weighted by the mask, divided by the mask sum.

        NOTE(review): a row whose mask is all zeros divides by zero here, as in
        the original inline expression -- confirm callers never produce one.
        """
        weights = attention_mask.unsqueeze(-1)
        return (hidden * weights).sum(1) / attention_mask.sum(-1).unsqueeze(-1)

    def pooler(self, inputs, attention_mask):
        """Reduce the encoder output to one vector per sequence.

        Args:
            inputs: indexable encoder output.  Presumably
                ``(last_hidden_state, pooled_output, hidden_states)`` --
                confirm against ``BertForSimCSE``.
            attention_mask: mask used as averaging weights by the ``*-avg``
                pooler types; unused by ``"cls"`` and ``"pooled"``.

        Returns:
            * ``"cls"``: position 0 along dim 1 of ``inputs[0]``.
            * ``"pooled"``: ``inputs[1]`` unchanged.
            * ``"last-avg"``: masked mean of ``inputs[0]``.
            * ``"first-last-avg"``: masked mean of the element-wise average of
              ``inputs[2][1]`` and ``inputs[0]``.

        Raises:
            ValueError: if ``self.pooler_type`` is not one of the values above.
                Previously this case silently returned ``None``.
        """
        if self.pooler_type == "cls":
            return inputs[0][:, 0]

        if self.pooler_type == "pooled":
            return inputs[1]

        if self.pooler_type == "last-avg":
            return self._masked_mean(inputs[0], attention_mask)

        if self.pooler_type == "first-last-avg":
            first_hidden = inputs[2][1]
            last_hidden = inputs[0]
            return self._masked_mean((first_hidden + last_hidden) / 2.0, attention_mask)

        raise ValueError(
            f"Unsupported pooler_type {self.pooler_type!r}; "
            f"expected one of {self._POOLER_TYPES}"
        )

    def create_use_row(self, labels):
        """Return the row indices that act as queries in the contrastive loss.

        Rows are grouped in consecutive triples; the first two rows of every
        triple are kept and the third is dropped.

        Args:
            labels: tensor whose ``size(0)`` gives the number of rows and whose
                ``sbp`` / ``placement`` are reused for the result.

        Returns:
            A tensor holding every index ``r`` in ``range(labels.size(0))``
            with ``r % 3 != 2``.
        """
        use_row = [row for row in range(labels.size(0)) if row % 3 != 2]
        return flow.tensor(use_row, sbp=labels.sbp, placement=labels.placement)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Compute the training loss or the evaluation similarities.

        Args:
            input_ids: token ids.  With ``bs = input_ids.size(0)`` they are
                viewed as ``(bs * 3, -1)`` in training mode and
                ``(bs * 2, -1)`` in eval mode.
            attention_mask: reshaped exactly like ``input_ids``.
            token_type_ids: accepted for interface compatibility; not used.
            labels: ignored in training mode (targets are derived from row
                positions); returned unchanged in eval mode.

        Returns:
            ``{"loss": loss}`` in training mode, otherwise
            ``{"sim": sim.unsqueeze(1), "labels": labels}``.
        """
        bs = input_ids.size(0)
        broadcast_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])

        if self.training:
            # Every batch item carries three sequences (presumably anchor,
            # positive and hard negative -- confirm against the dataset).
            input_ids = input_ids.view(bs * 3, -1)
            attention_mask = attention_mask.view(bs * 3, -1)
            out = self.bert(input_ids, attention_mask)
            out = self.pooler(out, attention_mask)
            out = self.mlp(out)

            # The arange tensor only provides size/sbp/placement to create_use_row.
            row_index = flow.arange(
                out.size(0),
                sbp=broadcast_sbp,
                placement=out.placement,
            )
            use_row = self.create_use_row(row_index)
            # Target column per kept row r: r + 1 when r % 3 == 0 and r - 1
            # when r % 3 == 1, i.e. the first two rows of a triple point at
            # each other.
            labels = (use_row - use_row % 3 * 2) + 1

            sim = cosine_similarity(out.unsqueeze(1), out.unsqueeze(0))
            # Push the diagonal far down so self-similarity never wins.
            self_mask = flow.eye(
                out.size(0),
                sbp=broadcast_sbp,
                placement=out.placement,
            )
            sim = sim - self_mask * self._SELF_SIM_PENALTY
            # Third rows of each triple are dropped as queries but stay as columns.
            sim = flow.index_select(sim, dim=0, index=use_row)
            sim = sim / self._TEMPERATURE
            loss = nn.CrossEntropyLoss()(sim, labels)
            return {"loss": loss}

        # Eval mode: every batch item carries a pair of sequences.
        input_ids = input_ids.view(bs * 2, -1)
        attention_mask = attention_mask.view(bs * 2, -1)
        out = self.bert(input_ids, attention_mask)
        out = self.pooler(out, attention_mask)
        # NOTE(review): the MLP output is discarded, so similarities are
        # computed on the pooled (pre-MLP) embeddings.  The call is kept
        # exactly as it was -- confirm whether it is intentional.
        self.mlp(out)
        out = out.view(bs, 2, -1)
        sent1 = out[:, 0]
        sent2 = out[:, 1]
        sim = cosine_similarity(sent1, sent2)
        sim = sim.to_global(sbp=broadcast_sbp)
        return {"sim": sim.unsqueeze(1), "labels": labels}