Commit f75058c7 authored by Rayyyyy

First add.
import logging
import os
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer
from transformers import (
HfArgumentParser,
set_seed,
)
from .arguments import ModelArguments, DataArguments, \
RetrieverTrainingArguments as TrainingArguments
from .data import TrainDatasetForEmbedding, EmbedCollator
from .modeling import BiEncoderModel
from .trainer import BiTrainer
logger = logging.getLogger(__name__)
def main():
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
model_args: ModelArguments
data_args: DataArguments
training_args: TrainingArguments
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
training_args.local_rank,
training_args.device,
training_args.n_gpu,
bool(training_args.local_rank != -1),
training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)
logger.info("Model parameters %s", model_args)
logger.info("Data parameters %s", data_args)
# Set seed
set_seed(training_args.seed)
num_labels = 1
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=False,
)
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
num_labels=num_labels,
cache_dir=model_args.cache_dir,
)
logger.info('Config: %s', config)
model = BiEncoderModel(model_name=model_args.model_name_or_path,
normlized=training_args.normlized,
sentence_pooling_method=training_args.sentence_pooling_method,
negatives_cross_device=training_args.negatives_cross_device,
temperature=training_args.temperature,
use_inbatch_neg=training_args.use_inbatch_neg,
)
if training_args.fix_position_embedding:
for k, v in model.named_parameters():
if "position_embeddings" in k:
logging.info(f"Freeze the parameters for {k}")
v.requires_grad = False
train_dataset = TrainDatasetForEmbedding(args=data_args, tokenizer=tokenizer)
trainer = BiTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
data_collator=EmbedCollator(
tokenizer,
query_max_len=data_args.query_max_len,
passage_max_len=data_args.passage_max_len
),
tokenizer=tokenizer
)
Path(training_args.output_dir).mkdir(parents=True, exist_ok=True)
# Training
trainer.train()
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
if trainer.is_world_process_zero():
tokenizer.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
main()
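# Hedged launch sketch for the fine-tuning entry point above (not part of the original commit).
# The flags mirror fields referenced in main(); the module path, checkpoint name and the
# --train_data flag are assumptions made only for this example.
#
#   torchrun --nproc_per_node 2 -m finetune.run \
#       --model_name_or_path BAAI/bge-large-zh-v1.5 \
#       --train_data ./toy_finetune_data.jsonl \
#       --output_dir ./bge_finetuned \
#       --query_max_len 64 --passage_max_len 256 \
#       --temperature 0.02 \
#       --per_device_train_batch_size 4 --num_train_epochs 1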
from sentence_transformers import SentenceTransformer, models
from transformers.trainer import *
def save_ckpt_for_sentence_transformers(ckpt_dir, pooling_mode: str = 'cls', normlized: bool=True):
word_embedding_model = models.Transformer(ckpt_dir)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode=pooling_mode)
if normlized:
normlize_layer = models.Normalize()
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, normlize_layer], device='cpu')
else:
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device='cpu')
model.save(ckpt_dir)
class BiTrainer(Trainer):
def _save(self, output_dir: Optional[str] = None, state_dict=None):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info("Saving model checkpoint to %s", output_dir)
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not hasattr(self.model, 'save'):
raise NotImplementedError(
f'MODEL {self.model.__class__.__name__} '
f'does not support save interface')
else:
self.model.save(output_dir)
if self.tokenizer is not None and self.is_world_process_zero():
self.tokenizer.save_pretrained(output_dir)
torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
# save the checkpoint for sentence-transformers library
if self.is_world_process_zero():
save_ckpt_for_sentence_transformers(output_dir,
pooling_mode=self.args.sentence_pooling_method,
normlized=self.args.normlized)
def compute_loss(self, model, inputs, return_outputs=False):
"""
How the loss is computed by Trainer. By default, all models return the loss in the first element.
Subclass and override for custom behavior.
"""
outputs = model(**inputs)
loss = outputs.loss
return (loss, outputs) if return_outputs else loss
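# Hedged usage sketch (not part of the original commit): a checkpoint directory written by
# BiTrainer._save can be reloaded directly with sentence-transformers, because
# save_ckpt_for_sentence_transformers also writes the Transformer/Pooling(/Normalize) modules.
# The directory name below is only an example.
def _example_reload_sentence_transformer(ckpt_dir: str = "./bge_finetuned"):
    """Illustrative only: reload a saved checkpoint as a SentenceTransformer and encode text."""
    st_model = SentenceTransformer(ckpt_dir)
    return st_model.encode(["an example query", "an example passage"])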
import os
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class DataTrainingArguments:
train_data: Optional[str] = field(
default=None, metadata={"help": "Path to pretrain data"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
max_seq_length: Optional[int] = field(
default=512,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated. Default to the max input length of the model."
},
)
encoder_mlm_probability: float = field(default=0.3, metadata={"help": "mask ratio for encoder"})
decoder_mlm_probability: float = field(default=0.5, metadata={"help": "mask ratio for decoder"})
def __post_init__(self):
if not os.path.exists(self.train_data):
raise FileNotFoundError(f"cannot find file: {self.train_data}, please set a valid path")
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""
model_name_or_path: Optional[str] = field(
default='bert-base-uncased',
metadata={
"help": "The model checkpoint for weights initialization."
"Don't set if you want to train a model from scratch."
},
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
import os
import random
from copy import deepcopy
from dataclasses import dataclass
import torch.utils.data.dataset
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import DataCollatorForWholeWordMask
from .utils import tensorize_batch
class DatasetForPretraining(torch.utils.data.Dataset):
def __init__(self, data_dir):
if os.path.isdir(data_dir):
datasets = []
for file in os.listdir(data_dir):
print(f"Loading {file}")
file = os.path.join(data_dir, file)
datasets.append(self.load_dataset(file))
self.dataset = concatenate_datasets(datasets)
else:
print(f"Loading {data_dir}")
self.dataset = self.load_dataset(data_dir)
def load_dataset(self, file):
if file.endswith('.jsonl') or file.endswith('.json'):
return load_dataset('json', data_files=file)['train']
elif os.path.isdir(file):
return Dataset.load_from_disk(file)
else:
raise NotImplementedError(f"Not support this file format:{file}")
def __getitem__(self, item):
return self.dataset[item]['text']
def __len__(self):
return len(self.dataset)
@dataclass
class RetroMAECollator(DataCollatorForWholeWordMask):
max_seq_length: int = 512
encoder_mlm_probability: float = 0.15
decoder_mlm_probability: float = 0.15
def __call__(self, examples):
input_ids_batch = []
attention_mask_batch = []
encoder_mlm_mask_batch = []
decoder_labels_batch = []
decoder_matrix_attention_mask_batch = []
for e in examples:
e_trunc = self.tokenizer.encode(e, max_length=self.max_seq_length, truncation=True)
tokens = [self.tokenizer._convert_id_to_token(tid) for tid in e_trunc]
self.mlm_probability = self.encoder_mlm_probability
text_encoder_mlm_mask = self._whole_word_mask(tokens)
self.mlm_probability = self.decoder_mlm_probability
mask_set = []
for _ in range(min(len(tokens), 128)):
mask_set.append(self._whole_word_mask(tokens))
text_matrix_attention_mask = []
for i in range(len(tokens)):
idx = random.randint(0, min(len(tokens), 128) - 1)
text_decoder_mlm_mask = deepcopy(mask_set[idx])
text_decoder_mlm_mask[i] = 1
text_matrix_attention_mask.append(text_decoder_mlm_mask)
input_ids_batch.append(torch.tensor(e_trunc))
attention_mask_batch.append(torch.tensor([1] * len(e_trunc)))
e_trunc[0] = -100
e_trunc[-1] = -100
decoder_labels_batch.append(torch.tensor(e_trunc))
encoder_mlm_mask_batch.append(torch.tensor(text_encoder_mlm_mask))
decoder_matrix_attention_mask_batch.append(1 - torch.tensor(text_matrix_attention_mask))
input_ids_batch = tensorize_batch(input_ids_batch, self.tokenizer.pad_token_id)
attention_mask_batch = tensorize_batch(attention_mask_batch, 0)
origin_input_ids_batch = input_ids_batch.clone()
encoder_mlm_mask_batch = tensorize_batch(encoder_mlm_mask_batch, 0)
encoder_input_ids_batch, encoder_labels_batch = self.torch_mask_tokens(input_ids_batch, encoder_mlm_mask_batch)
decoder_labels_batch = tensorize_batch(decoder_labels_batch, -100)
matrix_attention_mask_batch = tensorize_batch(decoder_matrix_attention_mask_batch, 0)
batch = {
"encoder_input_ids": encoder_input_ids_batch,
"encoder_attention_mask": attention_mask_batch,
"encoder_labels": encoder_labels_batch,
"decoder_input_ids": origin_input_ids_batch,
"decoder_attention_mask": matrix_attention_mask_batch, # [B,L,L]
"decoder_labels": decoder_labels_batch,
}
return batch
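# Hedged usage sketch for RetroMAECollator above (not part of the original commit; the
# checkpoint name is only an example). Each text yields a lightly masked encoder view and a
# per-position decoder attention matrix sampled from mask_set.
def _example_retromae_batch():
    """Illustrative only: build one batch and report the tensor shapes."""
    from transformers import AutoTokenizer  # local import, only needed for this sketch
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    collator = RetroMAECollator(tokenizer=tokenizer,
                                encoder_mlm_probability=0.3,
                                decoder_mlm_probability=0.5,
                                max_seq_length=64)
    batch = collator(["RetroMAE reconstructs heavily masked text from the CLS embedding."])
    # encoder_input_ids: [B, L]; decoder_attention_mask: [B, L, L]
    return {k: v.shape for k, v in batch.items()}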
'''
This code is adapted from the Hugging Face transformers library.
'''
import math
from typing import Optional, Tuple
import torch
import torch.utils.checkpoint
from torch import nn
from transformers.modeling_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from transformers.models.bert.modeling_bert import BertIntermediate, BertOutput, BertSelfOutput
from transformers.utils import (
logging,
)
logger = logging.get_logger(__name__)
class BertSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
query,
key,
value,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
mixed_query_layer = self.query(query)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_layer = past_key_value[0]
value_layer = past_key_value[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(key))
value_layer = self.transpose_for_scores(self.value(value))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(key))
value_layer = self.transpose_for_scores(self.value(value))
query_layer = self.transpose_for_scores(mixed_query_layer)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
seq_length = query.size()[1]
position_ids_l = torch.arange(seq_length, dtype=torch.long, device=query.device).view(-1, 1)
position_ids_r = torch.arange(seq_length, dtype=torch.long, device=query.device).view(1, -1)
distance = position_ids_l - position_ids_r
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
if self.position_embedding_type == "relative_key":
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
if self.is_decoder:
outputs = outputs + (past_key_value,)
return outputs
class BertAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = BertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
query, key, value,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], query)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class BertLayerForDecoder(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = BertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = BertAttention(config, position_embedding_type="absolute")
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
query, key, value,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
# if decoder, the last output is tuple of self-attn cache
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
)
# cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
# add cross-attn cache to positions 3,4 of present_key_value tuple
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
# if decoder, return the attn key/values as the last output
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
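# Hedged shape sketch for BertLayerForDecoder above (not part of the original commit; tiny
# random tensors only). Unlike the stock BertLayer, forward() takes separate query/key/value
# streams and an additive [B, 1, L, L] attention mask.
def _example_decoder_layer_shapes():
    """Illustrative only."""
    from transformers import BertConfig
    config = BertConfig(hidden_size=32, num_attention_heads=4, intermediate_size=64)
    layer = BertLayerForDecoder(config)
    q = torch.randn(2, 8, 32)            # position-specific queries, [B, L, H]
    h = torch.randn(2, 8, 32)            # CLS-enhanced hidden states, [B, L, H]
    ext_mask = torch.zeros(2, 1, 8, 8)   # additive mask: 0 keeps a position, a large negative value blocks it
    return layer(query=q, key=h, value=h, attention_mask=ext_mask)[0].shape  # -> [2, 8, 32]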
import logging
import os
import torch
from torch import nn
from transformers import BertForMaskedLM, AutoModelForMaskedLM
from transformers.modeling_outputs import MaskedLMOutput
from .arguments import ModelArguments
from .enhancedDecoder import BertLayerForDecoder
logger = logging.getLogger(__name__)
class RetroMAEForPretraining(nn.Module):
def __init__(
self,
bert: BertForMaskedLM,
model_args: ModelArguments,
):
super(RetroMAEForPretraining, self).__init__()
self.lm = bert
if hasattr(self.lm, 'bert'):
self.decoder_embeddings = self.lm.bert.embeddings
elif hasattr(self.lm, 'roberta'):
self.decoder_embeddings = self.lm.roberta.embeddings
else:
self.decoder_embeddings = self.lm.bert.embeddings
self.c_head = BertLayerForDecoder(bert.config)
self.c_head.apply(self.lm._init_weights)
self.cross_entropy = nn.CrossEntropyLoss()
self.model_args = model_args
def gradient_checkpointing_enable(self, **kwargs):
self.lm.gradient_checkpointing_enable(**kwargs)
def forward(self,
encoder_input_ids, encoder_attention_mask, encoder_labels,
decoder_input_ids, decoder_attention_mask, decoder_labels):
lm_out: MaskedLMOutput = self.lm(
encoder_input_ids, encoder_attention_mask,
labels=encoder_labels,
output_hidden_states=True,
return_dict=True
)
cls_hiddens = lm_out.hidden_states[-1][:, :1] # B 1 D
decoder_embedding_output = self.decoder_embeddings(input_ids=decoder_input_ids)
hiddens = torch.cat([cls_hiddens, decoder_embedding_output[:, 1:]], dim=1)
# decoder_position_ids = self.lm.bert.embeddings.position_ids[:, :decoder_input_ids.size(1)]
# decoder_position_embeddings = self.lm.bert.embeddings.position_embeddings(decoder_position_ids) # B L D
# query = decoder_position_embeddings + cls_hiddens
cls_hiddens = cls_hiddens.expand(hiddens.size(0), hiddens.size(1), hiddens.size(2))
query = self.decoder_embeddings(inputs_embeds=cls_hiddens)
matrix_attention_mask = self.lm.get_extended_attention_mask(
decoder_attention_mask,
decoder_attention_mask.shape,
decoder_attention_mask.device
)
hiddens = self.c_head(query=query,
key=hiddens,
value=hiddens,
attention_mask=matrix_attention_mask)[0]
pred_scores, loss = self.mlm_loss(hiddens, decoder_labels)
return (loss + lm_out.loss,)
def mlm_loss(self, hiddens, labels):
if hasattr(self.lm, 'cls'):
pred_scores = self.lm.cls(hiddens)
elif hasattr(self.lm, 'lm_head'):
pred_scores = self.lm.lm_head(hiddens)
else:
raise NotImplementedError
masked_lm_loss = self.cross_entropy(
pred_scores.view(-1, self.lm.config.vocab_size),
labels.view(-1)
)
return pred_scores, masked_lm_loss
def save_pretrained(self, output_dir: str):
self.lm.save_pretrained(os.path.join(output_dir, "encoder_model"))
torch.save(self.state_dict(), os.path.join(output_dir, 'pytorch_model.bin'))
@classmethod
def from_pretrained(
cls, model_args: ModelArguments,
*args, **kwargs
):
hf_model = AutoModelForMaskedLM.from_pretrained(*args, **kwargs)
model = cls(hf_model, model_args)
return model
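# Hedged construction sketch (mirrors the instantiation in run.py below; the checkpoint name is
# only an example, and model_args is assumed to be a parsed ModelArguments instance).
def _example_build_retromae(model_args: ModelArguments):
    """Illustrative only: forward() returns a 1-tuple of (decoder MLM loss + encoder MLM loss,)."""
    return RetroMAEForPretraining.from_pretrained(model_args, "bert-base-uncased")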
import logging
import os
import sys
import transformers
from transformers import (
AutoTokenizer,
BertForMaskedLM,
AutoConfig,
HfArgumentParser, set_seed, )
from transformers import (
TrainerCallback,
TrainingArguments,
TrainerState,
TrainerControl
)
from transformers.trainer_utils import is_main_process
from .arguments import DataTrainingArguments, ModelArguments
from .data import DatasetForPretraining, RetroMAECollator
from .modeling import RetroMAEForPretraining
from .trainer import PreTrainer
logger = logging.getLogger(__name__)
class TrainerCallbackForSaving(TrainerCallback):
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of an epoch.
"""
control.should_save = True
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
model_args: ModelArguments
data_args: DataTrainingArguments
training_args: TrainingArguments
training_args.remove_unused_columns = False
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
if training_args.local_rank in (0, -1):
logger.info("Training/evaluation parameters %s", training_args)
logger.info("Model parameters %s", model_args)
logger.info("Data parameters %s", data_args)
set_seed(training_args.seed)
model_class = RetroMAEForPretraining
collator_class = RetroMAECollator
if model_args.model_name_or_path:
model = model_class.from_pretrained(model_args, model_args.model_name_or_path)
logger.info(f"------Load model from {model_args.model_name_or_path}------")
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
elif model_args.config_name:
config = AutoConfig.from_pretrained(model_args.config_name)
bert = BertForMaskedLM(config)
model = model_class(bert, model_args)
logger.info("------Init the model------")
tokenizer = AutoTokenizer.from_pretrained(data_args.tokenizer_name)
else:
raise ValueError("You must provide the model_name_or_path or config_name")
dataset = DatasetForPretraining(data_args.train_data)
data_collator = collator_class(tokenizer,
encoder_mlm_probability=data_args.encoder_mlm_probability,
decoder_mlm_probability=data_args.decoder_mlm_probability,
max_seq_length=data_args.max_seq_length)
# Initialize our Trainer
trainer = PreTrainer(
model=model,
args=training_args,
train_dataset=dataset,
data_collator=data_collator,
tokenizer=tokenizer
)
trainer.add_callback(TrainerCallbackForSaving())
# # Training
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
if __name__ == "__main__":
main()
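# Hedged launch sketch for the RetroMAE pre-training entry point above (not part of the original
# commit). The flags mirror the dataclass fields in arguments.py; the module path and checkpoint
# name are assumptions made only for this example.
#
#   torchrun --nproc_per_node 2 -m retromae_pretrain.run \
#       --model_name_or_path bert-base-uncased \
#       --train_data ./pretrain_corpus.jsonl \
#       --output_dir ./retromae_ckpt \
#       --max_seq_length 512 \
#       --encoder_mlm_probability 0.3 --decoder_mlm_probability 0.5 \
#       --per_device_train_batch_size 8 --num_train_epochs 1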
import logging
import os
from typing import Dict, Optional
import torch
from transformers import Trainer
logger = logging.getLogger(__name__)
class PreTrainer(Trainer):
def log(self, logs: Dict[str, float]) -> None:
"""
Log `logs` on the various objects watching training.
Subclass and override this method to inject custom behavior.
Args:
logs (`Dict[str, float]`):
The values to log.
"""
logs["step"] = self.state.global_step
if self.state.epoch is not None:
logs["epoch"] = round(self.state.epoch, 2)
output = {**logs, **{"step": self.state.global_step}}
self.state.log_history.append(output)
self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
def _save(self, output_dir: Optional[str] = None, state_dict=None):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Saving model checkpoint to {output_dir}")
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not hasattr(self.model, 'save_pretrained'):
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
state_dict = self.model.state_dict()
torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
else:
self.model.save_pretrained(output_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(os.path.join(output_dir, "encoder_model"))
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
from typing import List
import torch
def tensorize_batch(sequences: List[torch.Tensor], padding_value, align_right=False) -> torch.Tensor:
if len(sequences[0].size()) == 1:
max_len_1 = max([s.size(0) for s in sequences])
out_dims = (len(sequences), max_len_1)
out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences):
length_1 = tensor.size(0)
if align_right:
out_tensor[i, -length_1:] = tensor
else:
out_tensor[i, :length_1] = tensor
return out_tensor
elif len(sequences[0].size()) == 2:
max_len_1 = max([s.size(0) for s in sequences])
max_len_2 = max([s.size(1) for s in sequences])
out_dims = (len(sequences), max_len_1, max_len_2)
out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences):
length_1 = tensor.size(0)
length_2 = tensor.size(1)
if align_right:
out_tensor[i, -length_1:, :length_2] = tensor
else:
out_tensor[i, :length_1, :length_2] = tensor
return out_tensor
else:
raise NotImplementedError(f"tensorize_batch only supports 1-D or 2-D tensors, got {len(sequences[0].size())} dims")
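# Hedged usage sketch for tensorize_batch (illustrative values only):
def _example_tensorize_batch():
    seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
    # -> tensor([[1, 2, 3],
    #            [4, 5, 0]])
    return tensorize_batch(seqs, padding_value=0)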
from typing import cast, List, Union, Tuple, Optional, Dict
import numpy as np
from collections import defaultdict
import torch
from tqdm import tqdm
import datasets
from transformers import PreTrainedTokenizerFast, BatchEncoding, DataCollatorWithPadding, XLMRobertaForMaskedLM, is_torch_npu_available
from torch.utils.data import DataLoader
from functools import partial
from FlagEmbedding.BGE_M3 import BGEM3ForInference
def _transform_func(examples: Dict[str, List],
tokenizer: PreTrainedTokenizerFast,
max_length: int = 8192,
) -> BatchEncoding:
inputs = tokenizer(examples['text'],
max_length=max_length,
padding=True,
return_token_type_ids=False,
truncation=True,
return_tensors='pt')
return inputs
class BGEM3FlagModel:
def __init__(
self,
model_name_or_path: str = None,
pooling_method: str = 'cls',
normalize_embeddings: bool = True,
use_fp16: bool = True,
device: str = None
) -> None:
self.model = BGEM3ForInference(
model_name=model_name_or_path,
normlized=normalize_embeddings,
sentence_pooling_method=pooling_method,
)
self.tokenizer = self.model.tokenizer
if device:
self.device = torch.device(device)
else:
if torch.cuda.is_available():
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16: self.model.half()
self.model = self.model.to(self.device)
if device is None:
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model.model = torch.nn.DataParallel(self.model.model)
else:
self.num_gpus = 1
self.model.eval()
def convert_id_to_token(self, lexical_weights: List[Dict]):
if isinstance(lexical_weights, dict):
lexical_weights = [lexical_weights]
new_lexical_weights = []
for item in lexical_weights:
new_item = {}
for id, weight in item.items():
token = self.tokenizer.decode([int(id)])
new_item[token] = weight
new_lexical_weights.append(new_item)
if len(new_lexical_weights) == 1:
new_lexical_weights = new_lexical_weights[0]
return new_lexical_weights
def compute_lexical_matching_score(self, lexical_weights_1: Dict, lexical_weights_2: Dict):
scores = 0
for token, weight in lexical_weights_1.items():
if token in lexical_weights_2:
scores += weight * lexical_weights_2[token]
return scores
def colbert_score(self, q_reps, p_reps):
q_reps, p_reps = torch.from_numpy(q_reps), torch.from_numpy(p_reps)
token_scores = torch.einsum('in,jn->ij', q_reps, p_reps)
scores, _ = token_scores.max(-1)
scores = torch.sum(scores) / q_reps.size(0)
return scores
@torch.no_grad()
def encode(self,
sentences: Union[List[str], str],
batch_size: int = 12,
max_length: int = 8192,
return_dense: bool = True,
return_sparse: bool = False,
return_colbert_vecs: bool = False) -> Dict:
if self.num_gpus > 1:
batch_size *= self.num_gpus
self.model.eval()
input_was_string = False
if isinstance(sentences, str):
sentences = [sentences]
input_was_string = True
def _process_token_weights(token_weights: np.ndarray, input_ids: list):
# convert to dict
result = defaultdict(int)
unused_tokens = set([self.tokenizer.cls_token_id, self.tokenizer.eos_token_id, self.tokenizer.pad_token_id,
self.tokenizer.unk_token_id])
# token_weights = np.ceil(token_weights * 100)
for w, idx in zip(token_weights, input_ids):
if idx not in unused_tokens and w > 0:
idx = str(idx)
# w = int(w)
if w > result[idx]:
result[idx] = w
return result
def _process_colbert_vecs(colbert_vecs: np.ndarray, attention_mask: list):
# delete the vectors of padding tokens
tokens_num = np.sum(attention_mask)
return colbert_vecs[:tokens_num - 1] # we don't use the embedding of cls, so select tokens_num-1
all_dense_embeddings, all_lexical_weights, all_colbert_vec = [], [], []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings",
disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
batch_data = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
output = self.model(batch_data,
return_dense=return_dense,
return_sparse=return_sparse,
return_colbert=return_colbert_vecs)
if return_dense:
all_dense_embeddings.append(output['dense_vecs'].cpu().numpy())
if return_sparse:
token_weights = output['sparse_vecs'].squeeze(-1)
all_lexical_weights.extend(list(map(_process_token_weights, token_weights.cpu().numpy(),
batch_data['input_ids'].cpu().numpy().tolist())))
if return_colbert_vecs:
all_colbert_vec.extend(list(map(_process_colbert_vecs, output['colbert_vecs'].cpu().numpy(),
batch_data['attention_mask'].cpu().numpy())))
if return_dense:
all_dense_embeddings = np.concatenate(all_dense_embeddings, axis=0)
if return_dense:
if input_was_string:
all_dense_embeddings = all_dense_embeddings[0]
else:
all_dense_embeddings = None
if return_sparse:
if input_was_string:
all_lexical_weights = all_lexical_weights[0]
else:
all_lexical_weights = None
if return_colbert_vecs:
if input_was_string:
all_colbert_vec = all_colbert_vec[0]
else:
all_colbert_vec = None
return {"dense_vecs": all_dense_embeddings, "lexical_weights": all_lexical_weights,
"colbert_vecs": all_colbert_vec}
@torch.no_grad()
def compute_score(self,
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
batch_size: int = 256,
max_query_length: int = 512,
max_passage_length: int = 8192,
weights_for_different_modes: List[float] = None) -> Dict[str, List[float]]:
def _tokenize(texts: list, max_length: int):
return self.tokenizer(
texts,
max_length=max_length,
padding=True,
return_token_type_ids=False,
truncation=True,
return_tensors='pt'
)
if self.num_gpus > 0:
batch_size *= self.num_gpus
self.model.eval()
if isinstance(sentence_pairs, list) and len(sentence_pairs) == 0:
return []
if isinstance(sentence_pairs[0], str):
one_input_pair = True
sentence_pairs = [sentence_pairs]
else:
one_input_pair = False
all_scores = {
'colbert': [],
'sparse': [],
'dense': [],
'sparse+dense': [],
'colbert+sparse+dense': []
}
for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores",
disable=len(sentence_pairs) < 128):
sentences_batch = sentence_pairs[start_index:start_index + batch_size]
queries_batch = [pair[0] for pair in sentences_batch]
corpus_batch = [pair[1] for pair in sentences_batch]
queries_inputs = _tokenize(queries_batch, max_length=max_query_length).to(self.device)
corpus_inputs = _tokenize(corpus_batch, max_length=max_passage_length).to(self.device)
queries_output = self.model(queries_inputs, return_dense=True, return_sparse=True, return_colbert=True,
return_sparse_embedding=True)
corpus_output = self.model(corpus_inputs, return_dense=True, return_sparse=True, return_colbert=True,
return_sparse_embedding=True)
q_dense_vecs, q_sparse_vecs, q_colbert_vecs = queries_output['dense_vecs'], queries_output['sparse_vecs'], \
queries_output['colbert_vecs']
p_dense_vecs, p_sparse_vecs, p_colbert_vecs = corpus_output['dense_vecs'], corpus_output['sparse_vecs'], \
corpus_output['colbert_vecs']
dense_scores = self.model.dense_score(q_dense_vecs, p_dense_vecs)
sparse_scores = self.model.sparse_score(q_sparse_vecs, p_sparse_vecs)
colbert_scores = self.model.colbert_score(q_colbert_vecs, p_colbert_vecs,
q_mask=queries_inputs['attention_mask'])
if weights_for_different_modes is None:
weights_for_different_modes = [1, 1., 1.]
weight_sum = 3
print("default weights for dense, sparse, colbert are [1.0, 1.0, 1.0] ")
else:
assert len(weights_for_different_modes) == 3
weight_sum = sum(weights_for_different_modes)
inx = torch.arange(0, len(sentences_batch))
dense_scores, sparse_scores, colbert_scores = dense_scores[inx, inx].float(), sparse_scores[
inx, inx].float(), colbert_scores[inx, inx].float()
all_scores['colbert'].extend(
colbert_scores.cpu().numpy().tolist()
)
all_scores['sparse'].extend(
sparse_scores.cpu().numpy().tolist()
)
all_scores['dense'].extend(
dense_scores.cpu().numpy().tolist()
)
all_scores['sparse+dense'].extend(
((sparse_scores * weights_for_different_modes[1] + dense_scores * weights_for_different_modes[0])/(weights_for_different_modes[1]+weights_for_different_modes[0])).cpu().numpy().tolist()
)
all_scores['colbert+sparse+dense'].extend(
((colbert_scores * weights_for_different_modes[2] + sparse_scores * weights_for_different_modes[1] + dense_scores * weights_for_different_modes[0])/weight_sum).cpu().numpy().tolist()
)
if one_input_pair:
return {k: v[0] for k, v in all_scores.items()}
return all_scores
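# Hedged usage sketch for BGEM3FlagModel above (not part of the original commit; the checkpoint
# name is only an example).
def _example_bge_m3():
    """Illustrative only: dense, sparse and multi-vector scores for one query/passage pair."""
    model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False)
    out = model.encode(["what is BGE M3?",
                        "BGE M3 supports dense, sparse and ColBERT-style retrieval."],
                       return_dense=True, return_sparse=True, return_colbert_vecs=True)
    dense_score = out["dense_vecs"][0] @ out["dense_vecs"][1]
    sparse_score = model.compute_lexical_matching_score(out["lexical_weights"][0],
                                                        out["lexical_weights"][1])
    colbert_score = model.colbert_score(out["colbert_vecs"][0], out["colbert_vecs"][1])
    return dense_score, sparse_score, colbert_score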
from typing import cast, List, Union, Tuple
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, is_torch_npu_available
class FlagModel:
def __init__(
self,
model_name_or_path: str = None,
pooling_method: str = 'cls',
normalize_embeddings: bool = True,
query_instruction_for_retrieval: str = None,
use_fp16: bool = True
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModel.from_pretrained(model_name_or_path)
self.query_instruction_for_retrieval = query_instruction_for_retrieval
self.normalize_embeddings = normalize_embeddings
self.pooling_method = pooling_method
if torch.cuda.is_available():
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16: self.model.half()
self.model = self.model.to(self.device)
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model = torch.nn.DataParallel(self.model)
def encode_queries(self, queries: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
convert_to_numpy: bool = True) -> np.ndarray:
'''
This function will be used for the retrieval task.
If there is an instruction for queries, it will be prepended to the query text.
'''
if self.query_instruction_for_retrieval is not None:
if isinstance(queries, str):
input_texts = self.query_instruction_for_retrieval + queries
else:
input_texts = ['{}{}'.format(self.query_instruction_for_retrieval, q) for q in queries]
else:
input_texts = queries
return self.encode(input_texts, batch_size=batch_size, max_length=max_length, convert_to_numpy=convert_to_numpy)
def encode_corpus(self,
corpus: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
convert_to_numpy: bool = True) -> np.ndarray:
'''
Encode the corpus for the retrieval task.
'''
return self.encode(corpus, batch_size=batch_size, max_length=max_length, convert_to_numpy=convert_to_numpy)
@torch.no_grad()
def encode(self,
sentences: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
convert_to_numpy: bool = True) -> np.ndarray:
if self.num_gpus > 0:
batch_size = batch_size * self.num_gpus
self.model.eval()
input_was_string = False
if isinstance(sentences, str):
sentences = [sentences]
input_was_string = True
all_embeddings = []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings",
disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
last_hidden_state = self.model(**inputs, return_dict=True).last_hidden_state
embeddings = self.pooling(last_hidden_state, inputs['attention_mask'])
if self.normalize_embeddings:
embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
embeddings = cast(torch.Tensor, embeddings)
if convert_to_numpy:
embeddings = embeddings.cpu().numpy()
all_embeddings.append(embeddings)
if convert_to_numpy:
all_embeddings = np.concatenate(all_embeddings, axis=0)
else:
all_embeddings = torch.stack(all_embeddings)
if input_was_string:
return all_embeddings[0]
return all_embeddings
def pooling(self,
last_hidden_state: torch.Tensor,
attention_mask: torch.Tensor = None):
if self.pooling_method == 'cls':
return last_hidden_state[:, 0]
elif self.pooling_method == 'mean':
s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)
d = attention_mask.sum(dim=1, keepdim=True).float()
return s / d
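# Hedged usage sketch for FlagModel above (not part of the original commit; the checkpoint name
# and the retrieval instruction are only examples).
def _example_flag_model():
    """Illustrative only: encode a query/passage pair and score them by inner product."""
    model = FlagModel("BAAI/bge-large-en-v1.5",
                      query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                      use_fp16=False)
    q = model.encode_queries(["what is RetroMAE?"])
    p = model.encode_corpus(["RetroMAE is a retrieval-oriented pre-training paradigm."])
    return q @ p.T  # inner product equals cosine similarity when normalize_embeddings=True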
class LLMEmbedder:
instructions = {
"qa": {
"query": "Represent this query for retrieving relevant documents: ",
"key": "Represent this document for retrieval: ",
},
"convsearch": {
"query": "Encode this query and context for searching relevant passages: ",
"key": "Encode this passage for retrieval: ",
},
"chat": {
"query": "Embed this dialogue to find useful historical dialogues: ",
"key": "Embed this historical dialogue for retrieval: ",
},
"lrlm": {
"query": "Embed this text chunk for finding useful historical chunks: ",
"key": "Embed this historical text chunk for retrieval: ",
},
"icl": {
"query": "Convert this example into vector to look for useful examples: ",
"key": "Convert this example into vector for retrieval: ",
},
"tool": {
"query": "Transform this user request for fetching helpful tool descriptions: ",
"key": "Transform this tool description for retrieval: "
},
}
def __init__(
self,
model_name_or_path: str = None,
pooling_method: str = 'cls',
normalize_embeddings: bool = True,
use_fp16: bool = True
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModel.from_pretrained(model_name_or_path)
self.normalize_embeddings = normalize_embeddings
self.pooling_method = pooling_method
if torch.cuda.is_available():
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16: self.model.half()
self.model = self.model.to(self.device)
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model = torch.nn.DataParallel(self.model)
def encode_queries(self, queries: Union[List[str], str],
batch_size: int = 256,
max_length: int = 256,
task: str = 'qa') -> np.ndarray:
'''
Encode queries into dense vectors.
Automatically add instructions according to given task.
'''
instruction = self.instructions[task]["query"]
if isinstance(queries, str):
input_texts = instruction + queries
else:
input_texts = [instruction + q for q in queries]
return self._encode(input_texts, batch_size=batch_size, max_length=max_length)
def encode_keys(self, keys: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
task: str = 'qa') -> np.ndarray:
'''
Encode keys into dense vectors.
Automatically add instructions according to given task.
'''
instruction = self.instructions[task]["key"]
if isinstance(keys, str):
input_texts = instruction + keys
else:
input_texts = [instruction + k for k in keys]
return self._encode(input_texts, batch_size=batch_size, max_length=max_length)
@torch.no_grad()
def _encode(self, sentences: Union[List[str], str], batch_size: int = 256, max_length: int = 512) -> np.ndarray:
if self.num_gpus > 0:
batch_size = batch_size * self.num_gpus
self.model.eval()
input_was_string = False
if isinstance(sentences, str):
sentences = [sentences]
input_was_string = True
all_embeddings = []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings",
disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
last_hidden_state = self.model(**inputs, return_dict=True).last_hidden_state
embeddings = self.pooling(last_hidden_state, inputs['attention_mask'])
if self.normalize_embeddings:
embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
embeddings = cast(torch.Tensor, embeddings)
all_embeddings.append(embeddings.cpu().numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
if input_was_string:
return all_embeddings[0]
return all_embeddings
def pooling(self,
last_hidden_state: torch.Tensor,
attention_mask: torch.Tensor = None):
if self.pooling_method == 'cls':
return last_hidden_state[:, 0]
elif self.pooling_method == 'mean':
s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)
d = attention_mask.sum(dim=1, keepdim=True).float()
return s / d
else:
raise NotImplementedError(f"Pooling method {self.pooling_method} not implemented!")
from typing import Union, List, Tuple, Any
import numpy as np
import torch
from torch import Tensor
from torch.utils.data import DataLoader
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, is_torch_npu_available
from peft import PeftModel
import warnings
from torch.utils.data import Dataset
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
class DatasetForReranker(Dataset):
def __init__(
self,
dataset,
tokenizer_path: str,
max_len: int = 512,
query_prefix: str = 'A: ',
passage_prefix: str = 'B: ',
cache_dir: str = None,
prompt: str = None
):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
trust_remote_code=True,
cache_dir=cache_dir)
self.dataset = dataset
self.max_len = max_len
self.query_prefix = query_prefix
self.passage_prefix = passage_prefix
self.total_len = len(self.dataset)
if prompt is None:
prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
self.prompt_inputs = self.tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)['input_ids']
sep = "\n"
self.sep_inputs = self.tokenizer(sep,
return_tensors=None,
add_special_tokens=False)['input_ids']
self.encode_max_length = self.max_len + len(self.sep_inputs) + len(self.prompt_inputs)
def __len__(self):
return self.total_len
def __getitem__(self, item):
query, passage = self.dataset[item]
query = self.query_prefix + query
passage = self.passage_prefix + passage
query_inputs = self.tokenizer(query,
return_tensors=None,
add_special_tokens=False,
max_length=self.max_len * 3 // 4,
truncation=True)
passage_inputs = self.tokenizer(passage,
return_tensors=None,
add_special_tokens=False,
max_length=self.max_len,
truncation=True)
item = self.tokenizer.prepare_for_model(
[self.tokenizer.bos_token_id] + query_inputs['input_ids'],
self.sep_inputs + passage_inputs['input_ids'],
truncation='only_second',
max_length=self.encode_max_length,
padding=False,
return_attention_mask=False,
return_token_type_ids=False,
add_special_tokens=False
)
item['input_ids'] = item['input_ids'] + self.sep_inputs + self.prompt_inputs
item['attention_mask'] = [1] * len(item['input_ids'])
item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
if 'position_ids' in item.keys():
item['position_ids'] = list(range(len(item['input_ids'])))
return item
class collater():
def __init__(self, tokenizer, max_len):
self.tokenizer = tokenizer
self.max_len = max_len
self.pad_to_multiple_of = 8
self.label_pad_token_id = -100
warnings.filterwarnings("ignore",
message="`max_length` is ignored when `padding`=`True` and there is no truncation strategy.")
def __call__(self, data):
labels = [feature["labels"] for feature in data] if "labels" in data[0].keys() else None
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors.
if labels is not None:
max_label_length = max(len(l) for l in labels)
if self.pad_to_multiple_of is not None:
max_label_length = (
(max_label_length + self.pad_to_multiple_of - 1)
// self.pad_to_multiple_of
* self.pad_to_multiple_of
)
padding_side = self.tokenizer.padding_side
for feature in data:
remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
if isinstance(feature["labels"], list):
feature["labels"] = (
feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
)
elif padding_side == "right":
feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
else:
feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
return self.tokenizer.pad(
data,
padding=True,
max_length=self.max_len,
pad_to_multiple_of=8,
return_tensors='pt',
)
def last_logit_pool(logits: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return logits[:, -1, :]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = logits.shape[0]
return torch.stack([logits[i, sequence_lengths[i], :] for i in range(batch_size)], dim=0)
def last_logit_pool_layerwise(logits: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return logits[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = logits.shape[0]
return logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class FlagReranker:
def __init__(
self,
model_name_or_path: str = None,
use_fp16: bool = False,
cache_dir: str = None,
device: Union[str, int] = None
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
if device and isinstance(device, str):
self.device = torch.device(device)
if device == 'cpu':
use_fp16 = False
else:
if torch.cuda.is_available():
if device is not None:
self.device = torch.device(f"cuda:{device}")
else:
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16:
self.model.half()
self.model = self.model.to(self.device)
self.model.eval()
if device is None:
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model = torch.nn.DataParallel(self.model)
else:
self.num_gpus = 1
@torch.no_grad()
def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 256,
max_length: int = 512, normalize: bool = False) -> List[float]:
if self.num_gpus > 0:
batch_size = batch_size * self.num_gpus
assert isinstance(sentence_pairs, list)
if isinstance(sentence_pairs[0], str):
sentence_pairs = [sentence_pairs]
all_scores = []
for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores",
disable=len(sentence_pairs) < 128):
sentences_batch = sentence_pairs[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
scores = self.model(**inputs, return_dict=True).logits.view(-1, ).float()
all_scores.extend(scores.cpu().numpy().tolist())
if normalize:
all_scores = [sigmoid(score) for score in all_scores]
return all_scores
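# Hedged usage sketch for FlagReranker above (not part of the original commit; the checkpoint
# name is only an example).
def _example_flag_reranker():
    """Illustrative only: score two query/passage pairs and squash the logits with a sigmoid."""
    reranker = FlagReranker("BAAI/bge-reranker-large", use_fp16=False)
    return reranker.compute_score(
        [("what is panda?", "The giant panda is a bear species endemic to China."),
         ("what is panda?", "pandas is a Python library for data analysis.")],
        normalize=True)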
class FlagLLMReranker:
def __init__(
self,
model_name_or_path: str = None,
peft_path: str = None,
use_fp16: bool = False,
use_bf16: bool = False,
cache_dir: str = None,
device: Union[str, int] = None
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True,
torch_dtype=torch.bfloat16 if use_bf16 else torch.float32)
if peft_path:
self.model = PeftModel.from_pretrained(self.model,peft_path)
self.model = self.model.merge_and_unload()
self.model_name_or_path = model_name_or_path
self.cache_dir = cache_dir
if device and isinstance(device, str):
self.device = torch.device(device)
else:
device = 0 if device is None else device
if torch.cuda.is_available():
torch.cuda.set_device(device)
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16 and use_bf16 is False:
self.model.half()
self.model = self.model.to(self.device)
self.model.eval()
self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
@torch.no_grad()
def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 16,
max_length: int = 512, prompt: str = None, normalize: bool = False,
use_dataloader: bool = False, num_workers: int = None) -> List[float]:
assert isinstance(sentence_pairs, list)
if isinstance(sentence_pairs[0], str):
sentence_pairs = [sentence_pairs]
length_sorted_idx = np.argsort([-self._text_length(q) - self._text_length(p) for q, p in sentence_pairs])
sentences_sorted = [sentence_pairs[idx] for idx in length_sorted_idx]
dataset, dataloader = None, None
if use_dataloader:
if num_workers is None:
num_workers = min(batch_size, 16)
dataset = DatasetForReranker(sentences_sorted,
self.model_name_or_path,
max_length,
cache_dir=self.cache_dir,
prompt=prompt)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size, drop_last=False,
num_workers=num_workers,
collate_fn=collater(self.tokenizer, max_length))
all_scores = []
if dataloader is not None:
for inputs in tqdm(dataloader):
inputs = inputs.to(self.device)
outputs = self.model(**inputs, output_hidden_states=True)
logits = outputs.logits
scores = last_logit_pool(logits, inputs['attention_mask'])
scores = scores[:, self.yes_loc]
all_scores.extend(scores.cpu().float().tolist())
else:
if prompt is None:
prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
prompt_inputs = self.tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)['input_ids']
sep = "\n"
sep_inputs = self.tokenizer(sep,
return_tensors=None,
add_special_tokens=False)['input_ids']
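            # Budget extra room for the separator and the instruction prompt that are appended after every query/passage pair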
encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs)
for batch_start in trange(0, len(sentences_sorted), batch_size):
batch_sentences = sentences_sorted[batch_start:batch_start + batch_size]
batch_sentences = [(f'A: {q}', f'B: {p}') for q,p in batch_sentences]
queries = [s[0] for s in batch_sentences]
passages = [s[1] for s in batch_sentences]
queries_inputs = self.tokenizer(queries,
return_tensors=None,
add_special_tokens=False,
max_length=max_length * 3 // 4,
truncation=True)
passages_inputs = self.tokenizer(passages,
return_tensors=None,
add_special_tokens=False,
max_length=max_length,
truncation=True)
batch_inputs = []
for query_inputs, passage_inputs in zip(queries_inputs['input_ids'], passages_inputs['input_ids']):
item = self.tokenizer.prepare_for_model(
[self.tokenizer.bos_token_id] + query_inputs,
sep_inputs + passage_inputs,
truncation='only_second',
max_length=encode_max_length,
padding=False,
return_attention_mask=False,
return_token_type_ids=False,
add_special_tokens=False
)
item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
item['attention_mask'] = [1] * len(item['input_ids'])
item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
if 'position_ids' in item.keys():
item['position_ids'] = list(range(len(item['input_ids'])))
batch_inputs.append(item)
collater_instance = collater(self.tokenizer, max_length)
batch_inputs = collater_instance(
[{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']} for item in
batch_inputs])
batch_inputs = {key: val.to(self.device) for key, val in batch_inputs.items()}
outputs = self.model(**batch_inputs, output_hidden_states=True)
logits = outputs.logits
scores = last_logit_pool(logits, batch_inputs['attention_mask'])
scores = scores[:, self.yes_loc]
all_scores.extend(scores.cpu().float().tolist())
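        # Undo the length-based sort so scores line up with the caller's original pair order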
all_scores = [all_scores[idx] for idx in np.argsort(length_sorted_idx)]
if normalize:
all_scores = [sigmoid(score) for score in all_scores]
# if len(all_scores) == 1:
# return all_scores[0]
return all_scores
def _text_length(self, text: Union[List[int], List[List[int]]]):
"""
Help function to get the length for the input text. Text can be either
a list of ints (which means a single text as input), or a tuple of list of ints
(representing several text inputs to the model).
"""
if isinstance(text, dict): # {key: value} case
return len(next(iter(text.values())))
elif not hasattr(text, '__len__'): # Object has no len() method
return 1
elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints
return len(text)
else:
return sum([len(t) for t in text]) # Sum of length of individual strings
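
# Illustrative usage sketch (the checkpoint name below is an assumption; substitute your own LLM-based reranker):
# reranker = FlagLLMReranker('BAAI/bge-reranker-v2-gemma', use_bf16=True)
# scores = reranker.compute_score([('what is a panda?', 'The giant panda is a bear species endemic to China.')])
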
class LayerWiseFlagLLMReranker:
def __init__(
self,
model_name_or_path: str = None,
peft_path: str = None,
use_fp16: bool = False,
use_bf16: bool = False,
cache_dir: str = None,
device: Union[str, int] = None
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True)
if use_bf16 is False and use_fp16 is False:
warnings.warn("Due to model constraints, `use_bf16` and `use_fp16` cannot both be `False`. Here, `use_fp16` is set to `True` by default.", UserWarning)
use_fp16 = True
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True,
torch_dtype=torch.bfloat16 if use_bf16 else torch.float32)
if peft_path:
self.model = PeftModel.from_pretrained(self.model,peft_path)
self.model = self.model.merge_and_unload()
self.model_name_or_path = model_name_or_path
self.cache_dir = cache_dir
if device and isinstance(device, str):
if device == 'cpu':
warnings.warn('The LLM-based layer-wise reranker does not support CPU; it has been set to CUDA.')
device = 'cuda'
self.device = torch.device(device)
else:
device = 0 if device is None else device
if torch.cuda.is_available():
torch.cuda.set_device(device)
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16 and use_bf16 is False:
self.model.half()
self.model = self.model.to(self.device)
self.model.eval()
self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
@torch.no_grad()
def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 16,
max_length: int = 512, cutoff_layers: List[int] = None, prompt: str = None,
normalize: bool = False, use_dataloader: bool = False,
num_workers: int = None) -> Union[float, List[float], List[List[float]]]:
assert isinstance(sentence_pairs, list)
if isinstance(sentence_pairs[0], str):
sentence_pairs = [sentence_pairs]
length_sorted_idx = np.argsort([-self._text_length(q) - self._text_length(p) for q, p in sentence_pairs])
sentences_sorted = [sentence_pairs[idx] for idx in length_sorted_idx]
dataset, dataloader = None, None
if use_dataloader:
if num_workers is None:
num_workers = min(batch_size, 16)
dataset = DatasetForReranker(sentences_sorted,
self.model_name_or_path,
max_length,
cache_dir=self.cache_dir,
prompt=prompt)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size, drop_last=False,
num_workers=num_workers,
collate_fn=collater(self.tokenizer, max_length))
all_scores = []
if dataloader is not None:
for inputs in tqdm(dataloader):
inputs = inputs.to(self.device)
outputs = self.model(**inputs, output_hidden_states=True, cutoff_layers=cutoff_layers)
all_logits = outputs.logits
tmp_all_scores = []
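                # outputs.logits holds one tensor per requested layer (cutoff_layers); pool a score from each layer's last non-padded position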
for logits in all_logits:
scores = last_logit_pool_layerwise(logits, inputs['attention_mask'])
tmp_all_scores.append(scores.contiguous())
if len(all_scores) == 0:
for _ in range(len(tmp_all_scores)):
all_scores.append([])
for i in range(len(tmp_all_scores)):
all_scores[i].extend(tmp_all_scores[i].cpu().float().tolist())
else:
if prompt is None:
prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
prompt_inputs = self.tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)['input_ids']
sep = "\n"
sep_inputs = self.tokenizer(sep,
return_tensors=None,
add_special_tokens=False)['input_ids']
encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs)
for batch_start in trange(0, len(sentences_sorted), batch_size):
batch_sentences = sentences_sorted[batch_start:batch_start + batch_size]
batch_sentences = [(f'A: {q}', f'B: {p}') for q, p in batch_sentences]
queries = [s[0] for s in batch_sentences]
passages = [s[1] for s in batch_sentences]
queries_inputs = self.tokenizer(queries,
return_tensors=None,
add_special_tokens=False,
max_length=max_length * 3 // 4,
truncation=True)
passages_inputs = self.tokenizer(passages,
return_tensors=None,
add_special_tokens=False,
max_length=max_length,
truncation=True)
batch_inputs = []
for query_inputs, passage_inputs in zip(queries_inputs['input_ids'], passages_inputs['input_ids']):
item = self.tokenizer.prepare_for_model(
[self.tokenizer.bos_token_id] + query_inputs,
sep_inputs + passage_inputs,
truncation='only_second',
max_length=encode_max_length,
padding=False,
return_attention_mask=False,
return_token_type_ids=False,
add_special_tokens=False
)
item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
item['attention_mask'] = [1] * len(item['input_ids'])
item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
if 'position_ids' in item.keys():
item['position_ids'] = list(range(len(item['input_ids'])))
batch_inputs.append(item)
collater_instance = collater(self.tokenizer, max_length)
batch_inputs = collater_instance(
[{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']} for item in
batch_inputs])
batch_inputs = {key: val.to(self.device) for key, val in batch_inputs.items()}
outputs = self.model(**batch_inputs, output_hidden_states=True, cutoff_layers=cutoff_layers)
all_logits = outputs.logits
tmp_all_scores = []
for logits in all_logits:
scores = last_logit_pool_layerwise(logits, batch_inputs['attention_mask'])
tmp_all_scores.append(scores.contiguous())
if len(all_scores) == 0:
for _ in range(len(tmp_all_scores)):
all_scores.append([])
for i in range(len(tmp_all_scores)):
all_scores[i].extend(tmp_all_scores[i].cpu().float().tolist())
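        # For every layer's score list, undo the length-based sort (and optionally squash the scores through a sigmoid)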
for i in range(len(all_scores)):
all_scores[i] = [all_scores[i][idx] for idx in np.argsort(length_sorted_idx)]
if normalize:
all_scores[i] = [sigmoid(score) for score in all_scores[i]]
# if len(all_scores) == 1:
# if len(all_scores[0]) == 1:
# return all_scores[0][0]
# return all_scores[0]
return all_scores
def _text_length(self, text: Union[List[int], List[List[int]]]):
"""
Help function to get the length for the input text. Text can be either
a list of ints (which means a single text as input), or a tuple of list of ints
(representing several text inputs to the model).
"""
if isinstance(text, dict): # {key: value} case
return len(next(iter(text.values())))
elif not hasattr(text, '__len__'): # Object has no len() method
return 1
elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints
return len(text)
else:
return sum([len(t) for t in text]) # Sum of length of individual strings
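
# Illustrative usage sketch (the checkpoint name and layer choice are assumptions):
# reranker = LayerWiseFlagLLMReranker('BAAI/bge-reranker-v2-minicpm-layerwise', use_bf16=True)
# scores = reranker.compute_score([('query', 'passage')], cutoff_layers=[28])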
<div align="center">
<h1>LLM-Embedder [<a href="https://arxiv.org/abs/2310.07554">paper</a>]</h1>
<img src="imgs/llm-embedder.png" width="60%" class="center">
</div>
This is the codebase for LLM-Embedder, a unified embedding model that comprehensively supports the retrieval-augmentation needs of large language models, including knowledge retrieval, memory retrieval, exemplar retrieval, and tool retrieval. It is fine-tuned on 6 tasks:
- *Question Answering (qa)*
- *Conversational Search (convsearch)*
- *Long Conversation (chat)*
- *Long-Range Language Modeling (lrlm)*
- *In-Context Learning (icl)*
- *Tool Learning (tool)*
## Roadmap
- Details about how to fine-tune the LLM-Embedder are [here](docs/fine-tune.md).
- Details about how to evaluate different retrievers on various retrieval-augmented scenarios are [here](docs/evaluation.md).
## Usage
### Using `FlagEmbedding`
```pip install -U FlagEmbedding```
```python
from FlagEmbedding import LLMEmbedder
# Define queries and keys
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]
# Load the model (GPUs are used automatically if available)
model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)
# Encode for a specific task (qa, icl, chat, lrlm, tool, convsearch)
task = "qa"
query_embeddings = model.encode_queries(queries, task=task)
key_embeddings = model.encode_keys(keys, task=task)
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# [[0.8971, 0.8534]
# [0.8462, 0.9091]]
```
### Using `transformers`
```pip install -U transformers```
```python
import torch
from transformers import AutoTokenizer, AutoModel
INSTRUCTIONS = {
"qa": {
"query": "Represent this query for retrieving relevant documents: ",
"key": "Represent this document for retrieval: ",
},
"icl": {
"query": "Convert this example into vector to look for useful examples: ",
"key": "Convert this example into vector for retrieval: ",
},
"chat": {
"query": "Embed this dialogue to find useful historical dialogues: ",
"key": "Embed this historical dialogue for retrieval: ",
},
"lrlm": {
"query": "Embed this text chunk for finding useful historical chunks: ",
"key": "Embed this historical text chunk for retrieval: ",
},
"tool": {
"query": "Transform this user request for fetching helpful tool descriptions: ",
"key": "Transform this tool description for retrieval: "
},
"convsearch": {
"query": "Encode this query and context for searching relevant passages: ",
"key": "Encode this passage for retrieval: ",
},
}
# Define queries and keys
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]
# Load model
tokenizer = AutoTokenizer.from_pretrained('BAAI/llm-embedder')
model = AutoModel.from_pretrained('BAAI/llm-embedder')
# Add instructions for specific task (qa, icl, chat, lrlm, tool, convsearch)
instruction = INSTRUCTIONS["qa"]
queries = [instruction["query"] + query for query in queries]
keys = [instruction["key"] + key for key in keys]
# Tokenize sentences
query_inputs = tokenizer(queries, padding=True, return_tensors='pt')
key_inputs = tokenizer(keys, padding=True, return_tensors='pt')
# Encode
with torch.no_grad():
query_outputs = model(**query_inputs)
key_outputs = model(**key_inputs)
# CLS pooling
query_embeddings = query_outputs.last_hidden_state[:, 0]
key_embeddings = key_outputs.last_hidden_state[:, 0]
# Normalize
query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
key_embeddings = torch.nn.functional.normalize(key_embeddings, p=2, dim=1)
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# [[0.8971, 0.8534]
# [0.8462, 0.9091]]
```
### Using `sentence-transformers`
```pip install -U sentence-transformers```
```python
from sentence_transformers import SentenceTransformer
INSTRUCTIONS = {
"qa": {
"query": "Represent this query for retrieving relevant documents: ",
"key": "Represent this document for retrieval: ",
},
"icl": {
"query": "Convert this example into vector to look for useful examples: ",
"key": "Convert this example into vector for retrieval: ",
},
"chat": {
"query": "Embed this dialogue to find useful historical dialogues: ",
"key": "Embed this historical dialogue for retrieval: ",
},
"lrlm": {
"query": "Embed this text chunk for finding useful historical chunks: ",
"key": "Embed this historical text chunk for retrieval: ",
},
"tool": {
"query": "Transform this user request for fetching helpful tool descriptions: ",
"key": "Transform this tool description for retrieval: "
},
"convsearch": {
"query": "Encode this query and context for searching relevant passages: ",
"key": "Encode this passage for retrieval: ",
},
}
# Define queries and keys
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]
# Load model
model = SentenceTransformer('BAAI/llm-embedder', device="cpu")
# Add instructions for specific task (qa, icl, chat, lrlm, tool, convsearch)
instruction = INSTRUCTIONS["qa"]
queries = [instruction["query"] + query for query in queries]
keys = [instruction["key"] + key for key in keys]
# Encode
query_embeddings = model.encode(queries)
key_embeddings = model.encode(keys)
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# [[0.8971, 0.8534]
# [0.8462, 0.9091]]
```
## Contact
If you have any questions or suggestions related to this project, feel free to open an issue or pull request. You can also email Peitian Zhang (namespace.pt@gmail.com).
## Citation
If you find this repository useful, please consider giving it a star ⭐ and citing it:
```
@misc{zhang2023retrieve,
title={Retrieve Anything To Augment Large Language Models},
author={Peitian Zhang and Shitao Xiao and Zheng Liu and Zhicheng Dou and Jian-Yun Nie},
year={2023},
eprint={2310.07554},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
```
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 0
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
},
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}