Commit f75058c7 authored by Rayyyyy

First add.
import logging
import os
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer
from transformers import (
HfArgumentParser,
set_seed,
)
from .arguments import ModelArguments, DataArguments, \
RetrieverTrainingArguments as TrainingArguments
from .data import TrainDatasetForEmbedding, EmbedCollator
from .modeling import BiEncoderModel
from .trainer import BiTrainer
logger = logging.getLogger(__name__)
def main():
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
model_args: ModelArguments
data_args: DataArguments
training_args: TrainingArguments
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
training_args.local_rank,
training_args.device,
training_args.n_gpu,
bool(training_args.local_rank != -1),
training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)
logger.info("Model parameters %s", model_args)
logger.info("Data parameters %s", data_args)
# Set seed
set_seed(training_args.seed)
num_labels = 1
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=False,
)
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
num_labels=num_labels,
cache_dir=model_args.cache_dir,
)
logger.info('Config: %s', config)
model = BiEncoderModel(model_name=model_args.model_name_or_path,
normlized=training_args.normlized,
sentence_pooling_method=training_args.sentence_pooling_method,
negatives_cross_device=training_args.negatives_cross_device,
temperature=training_args.temperature,
use_inbatch_neg=training_args.use_inbatch_neg,
)
if training_args.fix_position_embedding:
for k, v in model.named_parameters():
if "position_embeddings" in k:
logging.info(f"Freeze the parameters for {k}")
v.requires_grad = False
train_dataset = TrainDatasetForEmbedding(args=data_args, tokenizer=tokenizer)
trainer = BiTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
data_collator=EmbedCollator(
tokenizer,
query_max_len=data_args.query_max_len,
passage_max_len=data_args.passage_max_len
),
tokenizer=tokenizer
)
Path(training_args.output_dir).mkdir(parents=True, exist_ok=True)
# Training
trainer.train()
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
if trainer.is_world_process_zero():
tokenizer.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
main()
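# Hedged launch sketch for the fine-tuning entry point above (not part of the original commit).
# The flags mirror fields referenced in main(); the module path, checkpoint name and the
# --train_data flag are assumptions made only for this example.
#
#   torchrun --nproc_per_node 2 -m finetune.run \
#       --model_name_or_path BAAI/bge-large-zh-v1.5 \
#       --train_data ./toy_finetune_data.jsonl \
#       --output_dir ./bge_finetuned \
#       --query_max_len 64 --passage_max_len 256 \
#       --temperature 0.02 \
#       --per_device_train_batch_size 4 --num_train_epochs 1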
from sentence_transformers import SentenceTransformer, models
from transformers.trainer import *
def save_ckpt_for_sentence_transformers(ckpt_dir, pooling_mode: str = 'cls', normlized: bool=True):
word_embedding_model = models.Transformer(ckpt_dir)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode=pooling_mode)
if normlized:
normlize_layer = models.Normalize()
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, normlize_layer], device='cpu')
else:
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device='cpu')
model.save(ckpt_dir)
class BiTrainer(Trainer):
def _save(self, output_dir: Optional[str] = None, state_dict=None):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info("Saving model checkpoint to %s", output_dir)
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not hasattr(self.model, 'save'):
raise NotImplementedError(
f'MODEL {self.model.__class__.__name__} '
f'does not support save interface')
else:
self.model.save(output_dir)
if self.tokenizer is not None and self.is_world_process_zero():
self.tokenizer.save_pretrained(output_dir)
torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
# save the checkpoint for sentence-transformers library
if self.is_world_process_zero():
save_ckpt_for_sentence_transformers(output_dir,
pooling_mode=self.args.sentence_pooling_method,
normlized=self.args.normlized)
def compute_loss(self, model, inputs, return_outputs=False):
"""
How the loss is computed by Trainer. By default, all models return the loss in the first element.
Subclass and override for custom behavior.
"""
outputs = model(**inputs)
loss = outputs.loss
return (loss, outputs) if return_outputs else loss
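# Hedged usage sketch (not part of the original commit): a checkpoint directory written by
# BiTrainer._save can be reloaded directly with sentence-transformers, because
# save_ckpt_for_sentence_transformers also writes the Transformer/Pooling(/Normalize) modules.
# The directory name below is only an example.
def _example_reload_sentence_transformer(ckpt_dir: str = "./bge_finetuned"):
    """Illustrative only: reload a saved checkpoint as a SentenceTransformer and encode text."""
    st_model = SentenceTransformer(ckpt_dir)
    return st_model.encode(["an example query", "an example passage"])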
import os
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class DataTrainingArguments:
train_data: Optional[str] = field(
default=None, metadata={"help": "Path to pretrain data"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
max_seq_length: Optional[int] = field(
default=512,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated. Default to the max input length of the model."
},
)
encoder_mlm_probability: float = field(default=0.3, metadata={"help": "mask ratio for encoder"})
decoder_mlm_probability: float = field(default=0.5, metadata={"help": "mask ratio for decoder"})
def __post_init__(self):
if not os.path.exists(self.train_data):
raise FileNotFoundError(f"cannot find file: {self.train_data}, please set a valid path")
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""
model_name_or_path: Optional[str] = field(
default='bert-base-uncased',
metadata={
"help": "The model checkpoint for weights initialization."
"Don't set if you want to train a model from scratch."
},
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
import os
import random
from copy import deepcopy
from dataclasses import dataclass
import torch.utils.data.dataset
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import DataCollatorForWholeWordMask
from .utils import tensorize_batch
class DatasetForPretraining(torch.utils.data.Dataset):
def __init__(self, data_dir):
if os.path.isdir(data_dir):
datasets = []
for file in os.listdir(data_dir):
print(f"Loading {file}")
file = os.path.join(data_dir, file)
datasets.append(self.load_dataset(file))
self.dataset = concatenate_datasets(datasets)
else:
print(f"Loading {data_dir}")
self.dataset = self.load_dataset(data_dir)
def load_dataset(self, file):
if file.endswith('.jsonl') or file.endswith('.json'):
return load_dataset('json', data_files=file)['train']
elif os.path.isdir(file):
return Dataset.load_from_disk(file)
else:
raise NotImplementedError(f"Not support this file format:{file}")
def __getitem__(self, item):
return self.dataset[item]['text']
def __len__(self):
return len(self.dataset)
@dataclass
class RetroMAECollator(DataCollatorForWholeWordMask):
max_seq_length: int = 512
encoder_mlm_probability: float = 0.15
decoder_mlm_probability: float = 0.15
def __call__(self, examples):
input_ids_batch = []
attention_mask_batch = []
encoder_mlm_mask_batch = []
decoder_labels_batch = []
decoder_matrix_attention_mask_batch = []
for e in examples:
e_trunc = self.tokenizer.encode(e, max_length=self.max_seq_length, truncation=True)
tokens = [self.tokenizer._convert_id_to_token(tid) for tid in e_trunc]
self.mlm_probability = self.encoder_mlm_probability
text_encoder_mlm_mask = self._whole_word_mask(tokens)
self.mlm_probability = self.decoder_mlm_probability
mask_set = []
for _ in range(min(len(tokens), 128)):
mask_set.append(self._whole_word_mask(tokens))
text_matrix_attention_mask = []
for i in range(len(tokens)):
idx = random.randint(0, min(len(tokens), 128) - 1)
text_decoder_mlm_mask = deepcopy(mask_set[idx])
text_decoder_mlm_mask[i] = 1
text_matrix_attention_mask.append(text_decoder_mlm_mask)
input_ids_batch.append(torch.tensor(e_trunc))
attention_mask_batch.append(torch.tensor([1] * len(e_trunc)))
e_trunc[0] = -100
e_trunc[-1] = -100
decoder_labels_batch.append(torch.tensor(e_trunc))
encoder_mlm_mask_batch.append(torch.tensor(text_encoder_mlm_mask))
decoder_matrix_attention_mask_batch.append(1 - torch.tensor(text_matrix_attention_mask))
input_ids_batch = tensorize_batch(input_ids_batch, self.tokenizer.pad_token_id)
attention_mask_batch = tensorize_batch(attention_mask_batch, 0)
origin_input_ids_batch = input_ids_batch.clone()
encoder_mlm_mask_batch = tensorize_batch(encoder_mlm_mask_batch, 0)
encoder_input_ids_batch, encoder_labels_batch = self.torch_mask_tokens(input_ids_batch, encoder_mlm_mask_batch)
decoder_labels_batch = tensorize_batch(decoder_labels_batch, -100)
matrix_attention_mask_batch = tensorize_batch(decoder_matrix_attention_mask_batch, 0)
batch = {
"encoder_input_ids": encoder_input_ids_batch,
"encoder_attention_mask": attention_mask_batch,
"encoder_labels": encoder_labels_batch,
"decoder_input_ids": origin_input_ids_batch,
"decoder_attention_mask": matrix_attention_mask_batch, # [B,L,L]
"decoder_labels": decoder_labels_batch,
}
return batch
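# Hedged usage sketch for RetroMAECollator above (not part of the original commit; the
# checkpoint name is only an example). Each text yields a lightly masked encoder view and a
# per-position decoder attention matrix sampled from mask_set.
def _example_retromae_batch():
    """Illustrative only: build one batch and report the tensor shapes."""
    from transformers import AutoTokenizer  # local import, only needed for this sketch
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    collator = RetroMAECollator(tokenizer=tokenizer,
                                encoder_mlm_probability=0.3,
                                decoder_mlm_probability=0.5,
                                max_seq_length=64)
    batch = collator(["RetroMAE reconstructs heavily masked text from the CLS embedding."])
    # encoder_input_ids: [B, L]; decoder_attention_mask: [B, L, L]
    return {k: v.shape for k, v in batch.items()}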
'''
This code is adapted from the Hugging Face transformers library.
'''
import math
from typing import Optional, Tuple
import torch
import torch.utils.checkpoint
from torch import nn
from transformers.modeling_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from transformers.models.bert.modeling_bert import BertIntermediate, BertOutput, BertSelfOutput
from transformers.utils import (
logging,
)
logger = logging.get_logger(__name__)
class BertSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
query,
key,
value,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
mixed_query_layer = self.query(query)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_layer = past_key_value[0]
value_layer = past_key_value[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(key))
value_layer = self.transpose_for_scores(self.value(value))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(key))
value_layer = self.transpose_for_scores(self.value(value))
query_layer = self.transpose_for_scores(mixed_query_layer)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
seq_length = query.size()[1]
position_ids_l = torch.arange(seq_length, dtype=torch.long, device=query.device).view(-1, 1)
position_ids_r = torch.arange(seq_length, dtype=torch.long, device=query.device).view(1, -1)
distance = position_ids_l - position_ids_r
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
if self.position_embedding_type == "relative_key":
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
if self.is_decoder:
outputs = outputs + (past_key_value,)
return outputs
class BertAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = BertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
query, key, value,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], query)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class BertLayerForDecoder(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = BertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = BertAttention(config, position_embedding_type="absolute")
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
query, key, value,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
# if decoder, the last output is tuple of self-attn cache
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
)
# cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
# add cross-attn cache to positions 3,4 of present_key_value tuple
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
# if decoder, return the attn key/values as the last output
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
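# Hedged shape sketch for BertLayerForDecoder above (not part of the original commit; tiny
# random tensors only). Unlike the stock BertLayer, forward() takes separate query/key/value
# streams and an additive [B, 1, L, L] attention mask.
def _example_decoder_layer_shapes():
    """Illustrative only."""
    from transformers import BertConfig
    config = BertConfig(hidden_size=32, num_attention_heads=4, intermediate_size=64)
    layer = BertLayerForDecoder(config)
    q = torch.randn(2, 8, 32)            # position-specific queries, [B, L, H]
    h = torch.randn(2, 8, 32)            # CLS-enhanced hidden states, [B, L, H]
    ext_mask = torch.zeros(2, 1, 8, 8)   # additive mask: 0 keeps a position, a large negative value blocks it
    return layer(query=q, key=h, value=h, attention_mask=ext_mask)[0].shape  # -> [2, 8, 32]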
import logging
import os
import torch
from torch import nn
from transformers import BertForMaskedLM, AutoModelForMaskedLM
from transformers.modeling_outputs import MaskedLMOutput
from .arguments import ModelArguments
from .enhancedDecoder import BertLayerForDecoder
logger = logging.getLogger(__name__)
class RetroMAEForPretraining(nn.Module):
def __init__(
self,
bert: BertForMaskedLM,
model_args: ModelArguments,
):
super(RetroMAEForPretraining, self).__init__()
self.lm = bert
if hasattr(self.lm, 'bert'):
self.decoder_embeddings = self.lm.bert.embeddings
elif hasattr(self.lm, 'roberta'):
self.decoder_embeddings = self.lm.roberta.embeddings
else:
self.decoder_embeddings = self.lm.bert.embeddings
self.c_head = BertLayerForDecoder(bert.config)
self.c_head.apply(self.lm._init_weights)
self.cross_entropy = nn.CrossEntropyLoss()
self.model_args = model_args
def gradient_checkpointing_enable(self, **kwargs):
self.lm.gradient_checkpointing_enable(**kwargs)
def forward(self,
encoder_input_ids, encoder_attention_mask, encoder_labels,
decoder_input_ids, decoder_attention_mask, decoder_labels):
lm_out: MaskedLMOutput = self.lm(
encoder_input_ids, encoder_attention_mask,
labels=encoder_labels,
output_hidden_states=True,
return_dict=True
)
cls_hiddens = lm_out.hidden_states[-1][:, :1] # B 1 D
decoder_embedding_output = self.decoder_embeddings(input_ids=decoder_input_ids)
hiddens = torch.cat([cls_hiddens, decoder_embedding_output[:, 1:]], dim=1)
# decoder_position_ids = self.lm.bert.embeddings.position_ids[:, :decoder_input_ids.size(1)]
# decoder_position_embeddings = self.lm.bert.embeddings.position_embeddings(decoder_position_ids) # B L D
# query = decoder_position_embeddings + cls_hiddens
cls_hiddens = cls_hiddens.expand(hiddens.size(0), hiddens.size(1), hiddens.size(2))
query = self.decoder_embeddings(inputs_embeds=cls_hiddens)
matrix_attention_mask = self.lm.get_extended_attention_mask(
decoder_attention_mask,
decoder_attention_mask.shape,
decoder_attention_mask.device
)
hiddens = self.c_head(query=query,
key=hiddens,
value=hiddens,
attention_mask=matrix_attention_mask)[0]
pred_scores, loss = self.mlm_loss(hiddens, decoder_labels)
return (loss + lm_out.loss,)
def mlm_loss(self, hiddens, labels):
if hasattr(self.lm, 'cls'):
pred_scores = self.lm.cls(hiddens)
elif hasattr(self.lm, 'lm_head'):
pred_scores = self.lm.lm_head(hiddens)
else:
raise NotImplementedError
masked_lm_loss = self.cross_entropy(
pred_scores.view(-1, self.lm.config.vocab_size),
labels.view(-1)
)
return pred_scores, masked_lm_loss
def save_pretrained(self, output_dir: str):
self.lm.save_pretrained(os.path.join(output_dir, "encoder_model"))
torch.save(self.state_dict(), os.path.join(output_dir, 'pytorch_model.bin'))
@classmethod
def from_pretrained(
cls, model_args: ModelArguments,
*args, **kwargs
):
hf_model = AutoModelForMaskedLM.from_pretrained(*args, **kwargs)
model = cls(hf_model, model_args)
return model
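# Hedged construction sketch (mirrors the instantiation in run.py below; the checkpoint name is
# only an example, and model_args is assumed to be a parsed ModelArguments instance).
def _example_build_retromae(model_args: ModelArguments):
    """Illustrative only: forward() returns a 1-tuple of (decoder MLM loss + encoder MLM loss,)."""
    return RetroMAEForPretraining.from_pretrained(model_args, "bert-base-uncased")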
import logging
import os
import sys
import transformers
from transformers import (
AutoTokenizer,
BertForMaskedLM,
AutoConfig,
HfArgumentParser, set_seed, )
from transformers import (
TrainerCallback,
TrainingArguments,
TrainerState,
TrainerControl
)
from transformers.trainer_utils import is_main_process
from .arguments import DataTrainingArguments, ModelArguments
from .data import DatasetForPretraining, RetroMAECollator
from .modeling import RetroMAEForPretraining
from .trainer import PreTrainer
logger = logging.getLogger(__name__)
class TrainerCallbackForSaving(TrainerCallback):
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of an epoch.
"""
control.should_save = True
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
model_args: ModelArguments
data_args: DataTrainingArguments
training_args: TrainingArguments
training_args.remove_unused_columns = False
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
if training_args.local_rank in (0, -1):
logger.info("Training/evaluation parameters %s", training_args)
logger.info("Model parameters %s", model_args)
logger.info("Data parameters %s", data_args)
set_seed(training_args.seed)
model_class = RetroMAEForPretraining
collator_class = RetroMAECollator
if model_args.model_name_or_path:
model = model_class.from_pretrained(model_args, model_args.model_name_or_path)
logger.info(f"------Load model from {model_args.model_name_or_path}------")
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
elif model_args.config_name:
config = AutoConfig.from_pretrained(model_args.config_name)
bert = BertForMaskedLM(config)
model = model_class(bert, model_args)
logger.info("------Init the model------")
tokenizer = AutoTokenizer.from_pretrained(data_args.tokenizer_name)
else:
raise ValueError("You must provide the model_name_or_path or config_name")
dataset = DatasetForPretraining(data_args.train_data)
data_collator = collator_class(tokenizer,
encoder_mlm_probability=data_args.encoder_mlm_probability,
decoder_mlm_probability=data_args.decoder_mlm_probability,
max_seq_length=data_args.max_seq_length)
# Initialize our Trainer
trainer = PreTrainer(
model=model,
args=training_args,
train_dataset=dataset,
data_collator=data_collator,
tokenizer=tokenizer
)
trainer.add_callback(TrainerCallbackForSaving())
# # Training
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
if __name__ == "__main__":
main()
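# Hedged launch sketch for the RetroMAE pre-training entry point above (not part of the original
# commit). The flags mirror the dataclass fields in arguments.py; the module path and checkpoint
# name are assumptions made only for this example.
#
#   torchrun --nproc_per_node 2 -m retromae_pretrain.run \
#       --model_name_or_path bert-base-uncased \
#       --train_data ./pretrain_corpus.jsonl \
#       --output_dir ./retromae_ckpt \
#       --max_seq_length 512 \
#       --encoder_mlm_probability 0.3 --decoder_mlm_probability 0.5 \
#       --per_device_train_batch_size 8 --num_train_epochs 1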
import logging
import os
from typing import Dict, Optional
import torch
from transformers import Trainer
logger = logging.getLogger(__name__)
class PreTrainer(Trainer):
def log(self, logs: Dict[str, float]) -> None:
"""
Log `logs` on the various objects watching training.
Subclass and override this method to inject custom behavior.
Args:
logs (`Dict[str, float]`):
The values to log.
"""
logs["step"] = self.state.global_step
if self.state.epoch is not None:
logs["epoch"] = round(self.state.epoch, 2)
output = {**logs, **{"step": self.state.global_step}}
self.state.log_history.append(output)
self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
def _save(self, output_dir: Optional[str] = None, state_dict=None):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Saving model checkpoint to {output_dir}")
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not hasattr(self.model, 'save_pretrained'):
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
state_dict = self.model.state_dict()
torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
else:
self.model.save_pretrained(output_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(os.path.join(output_dir, "encoder_model"))
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
from typing import List
import torch
def tensorize_batch(sequences: List[torch.Tensor], padding_value, align_right=False) -> torch.Tensor:
if len(sequences[0].size()) == 1:
max_len_1 = max([s.size(0) for s in sequences])
out_dims = (len(sequences), max_len_1)
out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences):
length_1 = tensor.size(0)
if align_right:
out_tensor[i, -length_1:] = tensor
else:
out_tensor[i, :length_1] = tensor
return out_tensor
elif len(sequences[0].size()) == 2:
max_len_1 = max([s.size(0) for s in sequences])
max_len_2 = max([s.size(1) for s in sequences])
out_dims = (len(sequences), max_len_1, max_len_2)
out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences):
length_1 = tensor.size(0)
length_2 = tensor.size(1)
if align_right:
out_tensor[i, -length_1:, :length_2] = tensor
else:
out_tensor[i, :length_1, :length_2] = tensor
return out_tensor
else:
raise NotImplementedError(f"tensorize_batch only supports 1-D or 2-D tensors, got {len(sequences[0].size())} dims")
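# Hedged usage sketch for tensorize_batch (illustrative values only):
def _example_tensorize_batch():
    seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
    # -> tensor([[1, 2, 3],
    #            [4, 5, 0]])
    return tensorize_batch(seqs, padding_value=0)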
from typing import cast, List, Union, Tuple, Optional, Dict
import numpy as np
from collections import defaultdict
import torch
from tqdm import tqdm
import datasets
from transformers import PreTrainedTokenizerFast, BatchEncoding, DataCollatorWithPadding, XLMRobertaForMaskedLM, is_torch_npu_available
from torch.utils.data import DataLoader
from functools import partial
from FlagEmbedding.BGE_M3 import BGEM3ForInference
def _transform_func(examples: Dict[str, List],
tokenizer: PreTrainedTokenizerFast,
max_length: int = 8192,
) -> BatchEncoding:
inputs = tokenizer(examples['text'],
max_length=max_length,
padding=True,
return_token_type_ids=False,
truncation=True,
return_tensors='pt')
return inputs
class BGEM3FlagModel:
def __init__(
self,
model_name_or_path: str = None,
pooling_method: str = 'cls',
normalize_embeddings: bool = True,
use_fp16: bool = True,
device: str = None
) -> None:
self.model = BGEM3ForInference(
model_name=model_name_or_path,
normlized=normalize_embeddings,
sentence_pooling_method=pooling_method,
)
self.tokenizer = self.model.tokenizer
if device:
self.device = torch.device(device)
else:
if torch.cuda.is_available():
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16: self.model.half()
self.model = self.model.to(self.device)
if device is None:
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model.model = torch.nn.DataParallel(self.model.model)
else:
self.num_gpus = 1
self.model.eval()
def convert_id_to_token(self, lexical_weights: List[Dict]):
if isinstance(lexical_weights, dict):
lexical_weights = [lexical_weights]
new_lexical_weights = []
for item in lexical_weights:
new_item = {}
for id, weight in item.items():
token = self.tokenizer.decode([int(id)])
new_item[token] = weight
new_lexical_weights.append(new_item)
if len(new_lexical_weights) == 1:
new_lexical_weights = new_lexical_weights[0]
return new_lexical_weights
def compute_lexical_matching_score(self, lexical_weights_1: Dict, lexical_weights_2: Dict):
scores = 0
for token, weight in lexical_weights_1.items():
if token in lexical_weights_2:
scores += weight * lexical_weights_2[token]
return scores
def colbert_score(self, q_reps, p_reps):
q_reps, p_reps = torch.from_numpy(q_reps), torch.from_numpy(p_reps)
token_scores = torch.einsum('in,jn->ij', q_reps, p_reps)
scores, _ = token_scores.max(-1)
scores = torch.sum(scores) / q_reps.size(0)
return scores
@torch.no_grad()
def encode(self,
sentences: Union[List[str], str],
batch_size: int = 12,
max_length: int = 8192,
return_dense: bool = True,
return_sparse: bool = False,
return_colbert_vecs: bool = False) -> Dict:
if self.num_gpus > 1:
batch_size *= self.num_gpus
self.model.eval()
input_was_string = False
if isinstance(sentences, str):
sentences = [sentences]
input_was_string = True
def _process_token_weights(token_weights: np.ndarray, input_ids: list):
# convert to dict
result = defaultdict(int)
unused_tokens = set([self.tokenizer.cls_token_id, self.tokenizer.eos_token_id, self.tokenizer.pad_token_id,
self.tokenizer.unk_token_id])
# token_weights = np.ceil(token_weights * 100)
for w, idx in zip(token_weights, input_ids):
if idx not in unused_tokens and w > 0:
idx = str(idx)
# w = int(w)
if w > result[idx]:
result[idx] = w
return result
def _process_colbert_vecs(colbert_vecs: np.ndarray, attention_mask: list):
# delete the vectors of padding tokens
tokens_num = np.sum(attention_mask)
return colbert_vecs[:tokens_num - 1] # we don't use the embedding of cls, so select tokens_num-1
all_dense_embeddings, all_lexical_weights, all_colbert_vec = [], [], []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings",
disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
batch_data = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
output = self.model(batch_data,
return_dense=return_dense,
return_sparse=return_sparse,
return_colbert=return_colbert_vecs)
if return_dense:
all_dense_embeddings.append(output['dense_vecs'].cpu().numpy())
if return_sparse:
token_weights = output['sparse_vecs'].squeeze(-1)
all_lexical_weights.extend(list(map(_process_token_weights, token_weights.cpu().numpy(),
batch_data['input_ids'].cpu().numpy().tolist())))
if return_colbert_vecs:
all_colbert_vec.extend(list(map(_process_colbert_vecs, output['colbert_vecs'].cpu().numpy(),
batch_data['attention_mask'].cpu().numpy())))
if return_dense:
all_dense_embeddings = np.concatenate(all_dense_embeddings, axis=0)
if return_dense:
if input_was_string:
all_dense_embeddings = all_dense_embeddings[0]
else:
all_dense_embeddings = None
if return_sparse:
if input_was_string:
all_lexical_weights = all_lexical_weights[0]
else:
all_lexical_weights = None
if return_colbert_vecs:
if input_was_string:
all_colbert_vec = all_colbert_vec[0]
else:
all_colbert_vec = None
return {"dense_vecs": all_dense_embeddings, "lexical_weights": all_lexical_weights,
"colbert_vecs": all_colbert_vec}
@torch.no_grad()
def compute_score(self,
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
batch_size: int = 256,
max_query_length: int = 512,
max_passage_length: int = 8192,
weights_for_different_modes: List[float] = None) -> Dict[str, List[float]]:
def _tokenize(texts: list, max_length: int):
return self.tokenizer(
texts,
max_length=max_length,
padding=True,
return_token_type_ids=False,
truncation=True,
return_tensors='pt'
)
if self.num_gpus > 0:
batch_size *= self.num_gpus
self.model.eval()
if isinstance(sentence_pairs, list) and len(sentence_pairs) == 0:
return []
if isinstance(sentence_pairs[0], str):
one_input_pair = True
sentence_pairs = [sentence_pairs]
else:
one_input_pair = False
all_scores = {
'colbert': [],
'sparse': [],
'dense': [],
'sparse+dense': [],
'colbert+sparse+dense': []
}
for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores",
disable=len(sentence_pairs) < 128):
sentences_batch = sentence_pairs[start_index:start_index + batch_size]
queries_batch = [pair[0] for pair in sentences_batch]
corpus_batch = [pair[1] for pair in sentences_batch]
queries_inputs = _tokenize(queries_batch, max_length=max_query_length).to(self.device)
corpus_inputs = _tokenize(corpus_batch, max_length=max_passage_length).to(self.device)
queries_output = self.model(queries_inputs, return_dense=True, return_sparse=True, return_colbert=True,
return_sparse_embedding=True)
corpus_output = self.model(corpus_inputs, return_dense=True, return_sparse=True, return_colbert=True,
return_sparse_embedding=True)
q_dense_vecs, q_sparse_vecs, q_colbert_vecs = queries_output['dense_vecs'], queries_output['sparse_vecs'], \
queries_output['colbert_vecs']
p_dense_vecs, p_sparse_vecs, p_colbert_vecs = corpus_output['dense_vecs'], corpus_output['sparse_vecs'], \
corpus_output['colbert_vecs']
dense_scores = self.model.dense_score(q_dense_vecs, p_dense_vecs)
sparse_scores = self.model.sparse_score(q_sparse_vecs, p_sparse_vecs)
colbert_scores = self.model.colbert_score(q_colbert_vecs, p_colbert_vecs,
q_mask=queries_inputs['attention_mask'])
if weights_for_different_modes is None:
weights_for_different_modes = [1, 1., 1.]
weight_sum = 3
print("default weights for dense, sparse, colbert are [1.0, 1.0, 1.0] ")
else:
assert len(weights_for_different_modes) == 3
weight_sum = sum(weights_for_different_modes)
inx = torch.arange(0, len(sentences_batch))
dense_scores, sparse_scores, colbert_scores = dense_scores[inx, inx].float(), sparse_scores[
inx, inx].float(), colbert_scores[inx, inx].float()
all_scores['colbert'].extend(
colbert_scores.cpu().numpy().tolist()
)
all_scores['sparse'].extend(
sparse_scores.cpu().numpy().tolist()
)
all_scores['dense'].extend(
dense_scores.cpu().numpy().tolist()
)
all_scores['sparse+dense'].extend(
((sparse_scores * weights_for_different_modes[1] + dense_scores * weights_for_different_modes[0])/(weights_for_different_modes[1]+weights_for_different_modes[0])).cpu().numpy().tolist()
)
all_scores['colbert+sparse+dense'].extend(
((colbert_scores * weights_for_different_modes[2] + sparse_scores * weights_for_different_modes[1] + dense_scores * weights_for_different_modes[0])/weight_sum).cpu().numpy().tolist()
)
if one_input_pair:
return {k: v[0] for k, v in all_scores.items()}
return all_scores
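# Hedged usage sketch for BGEM3FlagModel above (not part of the original commit; the checkpoint
# name is only an example).
def _example_bge_m3():
    """Illustrative only: dense, sparse and multi-vector scores for one query/passage pair."""
    model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False)
    out = model.encode(["what is BGE M3?",
                        "BGE M3 supports dense, sparse and ColBERT-style retrieval."],
                       return_dense=True, return_sparse=True, return_colbert_vecs=True)
    dense_score = out["dense_vecs"][0] @ out["dense_vecs"][1]
    sparse_score = model.compute_lexical_matching_score(out["lexical_weights"][0],
                                                        out["lexical_weights"][1])
    colbert_score = model.colbert_score(out["colbert_vecs"][0], out["colbert_vecs"][1])
    return dense_score, sparse_score, colbert_score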
from typing import cast, List, Union, Tuple
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, is_torch_npu_available
class FlagModel:
def __init__(
self,
model_name_or_path: str = None,
pooling_method: str = 'cls',
normalize_embeddings: bool = True,
query_instruction_for_retrieval: str = None,
use_fp16: bool = True
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModel.from_pretrained(model_name_or_path)
self.query_instruction_for_retrieval = query_instruction_for_retrieval
self.normalize_embeddings = normalize_embeddings
self.pooling_method = pooling_method
if torch.cuda.is_available():
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16: self.model.half()
self.model = self.model.to(self.device)
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model = torch.nn.DataParallel(self.model)
def encode_queries(self, queries: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
convert_to_numpy: bool = True) -> np.ndarray:
'''
This function will be used for the retrieval task.
If there is an instruction for queries, it will be prepended to the query text.
'''
if self.query_instruction_for_retrieval is not None:
if isinstance(queries, str):
input_texts = self.query_instruction_for_retrieval + queries
else:
input_texts = ['{}{}'.format(self.query_instruction_for_retrieval, q) for q in queries]
else:
input_texts = queries
return self.encode(input_texts, batch_size=batch_size, max_length=max_length, convert_to_numpy=convert_to_numpy)
def encode_corpus(self,
corpus: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
convert_to_numpy: bool = True) -> np.ndarray:
'''
Encode the corpus for the retrieval task.
'''
return self.encode(corpus, batch_size=batch_size, max_length=max_length, convert_to_numpy=convert_to_numpy)
@torch.no_grad()
def encode(self,
sentences: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
convert_to_numpy: bool = True) -> np.ndarray:
if self.num_gpus > 0:
batch_size = batch_size * self.num_gpus
self.model.eval()
input_was_string = False
if isinstance(sentences, str):
sentences = [sentences]
input_was_string = True
all_embeddings = []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings",
disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
last_hidden_state = self.model(**inputs, return_dict=True).last_hidden_state
embeddings = self.pooling(last_hidden_state, inputs['attention_mask'])
if self.normalize_embeddings:
embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
embeddings = cast(torch.Tensor, embeddings)
if convert_to_numpy:
embeddings = embeddings.cpu().numpy()
all_embeddings.append(embeddings)
if convert_to_numpy:
all_embeddings = np.concatenate(all_embeddings, axis=0)
else:
all_embeddings = torch.stack(all_embeddings)
if input_was_string:
return all_embeddings[0]
return all_embeddings
def pooling(self,
last_hidden_state: torch.Tensor,
attention_mask: torch.Tensor = None):
if self.pooling_method == 'cls':
return last_hidden_state[:, 0]
elif self.pooling_method == 'mean':
s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)
d = attention_mask.sum(dim=1, keepdim=True).float()
return s / d
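# Hedged usage sketch for FlagModel above (not part of the original commit; the checkpoint name
# and the retrieval instruction are only examples).
def _example_flag_model():
    """Illustrative only: encode a query/passage pair and score them by inner product."""
    model = FlagModel("BAAI/bge-large-en-v1.5",
                      query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                      use_fp16=False)
    q = model.encode_queries(["what is RetroMAE?"])
    p = model.encode_corpus(["RetroMAE is a retrieval-oriented pre-training paradigm."])
    return q @ p.T  # inner product equals cosine similarity when normalize_embeddings=True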
class LLMEmbedder:
instructions = {
"qa": {
"query": "Represent this query for retrieving relevant documents: ",
"key": "Represent this document for retrieval: ",
},
"convsearch": {
"query": "Encode this query and context for searching relevant passages: ",
"key": "Encode this passage for retrieval: ",
},
"chat": {
"query": "Embed this dialogue to find useful historical dialogues: ",
"key": "Embed this historical dialogue for retrieval: ",
},
"lrlm": {
"query": "Embed this text chunk for finding useful historical chunks: ",
"key": "Embed this historical text chunk for retrieval: ",
},
"icl": {
"query": "Convert this example into vector to look for useful examples: ",
"key": "Convert this example into vector for retrieval: ",
},
"tool": {
"query": "Transform this user request for fetching helpful tool descriptions: ",
"key": "Transform this tool description for retrieval: "
},
}
def __init__(
self,
model_name_or_path: str = None,
pooling_method: str = 'cls',
normalize_embeddings: bool = True,
use_fp16: bool = True
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModel.from_pretrained(model_name_or_path)
self.normalize_embeddings = normalize_embeddings
self.pooling_method = pooling_method
if torch.cuda.is_available():
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16: self.model.half()
self.model = self.model.to(self.device)
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model = torch.nn.DataParallel(self.model)
def encode_queries(self, queries: Union[List[str], str],
batch_size: int = 256,
max_length: int = 256,
task: str = 'qa') -> np.ndarray:
'''
Encode queries into dense vectors.
Automatically add instructions according to given task.
'''
instruction = self.instructions[task]["query"]
if isinstance(queries, str):
input_texts = instruction + queries
else:
input_texts = [instruction + q for q in queries]
return self._encode(input_texts, batch_size=batch_size, max_length=max_length)
def encode_keys(self, keys: Union[List[str], str],
batch_size: int = 256,
max_length: int = 512,
task: str = 'qa') -> np.ndarray:
'''
Encode keys into dense vectors.
Automatically add instructions according to given task.
'''
instruction = self.instructions[task]["key"]
if isinstance(keys, str):
input_texts = instruction + keys
else:
input_texts = [instruction + k for k in keys]
return self._encode(input_texts, batch_size=batch_size, max_length=max_length)
@torch.no_grad()
def _encode(self, sentences: Union[List[str], str], batch_size: int = 256, max_length: int = 512) -> np.ndarray:
if self.num_gpus > 0:
batch_size = batch_size * self.num_gpus
self.model.eval()
input_was_string = False
if isinstance(sentences, str):
sentences = [sentences]
input_was_string = True
all_embeddings = []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings",
disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
last_hidden_state = self.model(**inputs, return_dict=True).last_hidden_state
embeddings = self.pooling(last_hidden_state, inputs['attention_mask'])
if self.normalize_embeddings:
embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
embeddings = cast(torch.Tensor, embeddings)
all_embeddings.append(embeddings.cpu().numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
if input_was_string:
return all_embeddings[0]
return all_embeddings
def pooling(self,
last_hidden_state: torch.Tensor,
attention_mask: torch.Tensor = None):
if self.pooling_method == 'cls':
return last_hidden_state[:, 0]
elif self.pooling_method == 'mean':
s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)
d = attention_mask.sum(dim=1, keepdim=True).float()
return s / d
else:
raise NotImplementedError(f"Pooling method {self.pooling_method} not implemented!")
from typing import Union, List, Tuple, Any
import numpy as np
import torch
from torch import Tensor
from torch.utils.data import DataLoader
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, is_torch_npu_available
from peft import PeftModel
import warnings
from torch.utils.data import Dataset
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
class DatasetForReranker(Dataset):
def __init__(
self,
dataset,
tokenizer_path: str,
max_len: int = 512,
query_prefix: str = 'A: ',
passage_prefix: str = 'B: ',
cache_dir: str = None,
prompt: str = None
):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
trust_remote_code=True,
cache_dir=cache_dir)
self.dataset = dataset
self.max_len = max_len
self.query_prefix = query_prefix
self.passage_prefix = passage_prefix
self.total_len = len(self.dataset)
if prompt is None:
prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
self.prompt_inputs = self.tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)['input_ids']
sep = "\n"
self.sep_inputs = self.tokenizer(sep,
return_tensors=None,
add_special_tokens=False)['input_ids']
self.encode_max_length = self.max_len + len(self.sep_inputs) + len(self.prompt_inputs)
def __len__(self):
return self.total_len
def __getitem__(self, item):
query, passage = self.dataset[item]
query = self.query_prefix + query
passage = self.passage_prefix + passage
query_inputs = self.tokenizer(query,
return_tensors=None,
add_special_tokens=False,
max_length=self.max_len * 3 // 4,
truncation=True)
passage_inputs = self.tokenizer(passage,
return_tensors=None,
add_special_tokens=False,
max_length=self.max_len,
truncation=True)
item = self.tokenizer.prepare_for_model(
[self.tokenizer.bos_token_id] + query_inputs['input_ids'],
self.sep_inputs + passage_inputs['input_ids'],
truncation='only_second',
max_length=self.encode_max_length,
padding=False,
return_attention_mask=False,
return_token_type_ids=False,
add_special_tokens=False
)
item['input_ids'] = item['input_ids'] + self.sep_inputs + self.prompt_inputs
item['attention_mask'] = [1] * len(item['input_ids'])
item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
if 'position_ids' in item.keys():
item['position_ids'] = list(range(len(item['input_ids'])))
return item
class collater():
def __init__(self, tokenizer, max_len):
self.tokenizer = tokenizer
self.max_len = max_len
self.pad_to_multiple_of = 8
self.label_pad_token_id = -100
warnings.filterwarnings("ignore",
message="`max_length` is ignored when `padding`=`True` and there is no truncation strategy.")
def __call__(self, data):
labels = [feature["labels"] for feature in data] if "labels" in data[0].keys() else None
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors.
if labels is not None:
max_label_length = max(len(l) for l in labels)
if self.pad_to_multiple_of is not None:
max_label_length = (
(max_label_length + self.pad_to_multiple_of - 1)
// self.pad_to_multiple_of
* self.pad_to_multiple_of
)
padding_side = self.tokenizer.padding_side
for feature in data:
remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
if isinstance(feature["labels"], list):
feature["labels"] = (
feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
)
elif padding_side == "right":
feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
else:
feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
return self.tokenizer.pad(
data,
padding=True,
max_length=self.max_len,
pad_to_multiple_of=8,
return_tensors='pt',
)
def last_logit_pool(logits: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return logits[:, -1, :]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = logits.shape[0]
return torch.stack([logits[i, sequence_lengths[i], :] for i in range(batch_size)], dim=0)
def last_logit_pool_layerwise(logits: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return logits[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = logits.shape[0]
return logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class FlagReranker:
def __init__(
self,
model_name_or_path: str = None,
use_fp16: bool = False,
cache_dir: str = None,
device: Union[str, int] = None
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
if device and isinstance(device, str):
self.device = torch.device(device)
if device == 'cpu':
use_fp16 = False
else:
if torch.cuda.is_available():
if device is not None:
self.device = torch.device(f"cuda:{device}")
else:
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16:
self.model.half()
self.model = self.model.to(self.device)
self.model.eval()
if device is None:
self.num_gpus = torch.cuda.device_count()
if self.num_gpus > 1:
print(f"----------using {self.num_gpus}*GPUs----------")
self.model = torch.nn.DataParallel(self.model)
else:
self.num_gpus = 1
@torch.no_grad()
def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 256,
max_length: int = 512, normalize: bool = False) -> List[float]:
if self.num_gpus > 0:
batch_size = batch_size * self.num_gpus
assert isinstance(sentence_pairs, list)
if isinstance(sentence_pairs[0], str):
sentence_pairs = [sentence_pairs]
all_scores = []
for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores",
disable=len(sentence_pairs) < 128):
sentences_batch = sentence_pairs[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to(self.device)
scores = self.model(**inputs, return_dict=True).logits.view(-1, ).float()
all_scores.extend(scores.cpu().numpy().tolist())
if normalize:
all_scores = [sigmoid(score) for score in all_scores]
return all_scores
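# Hedged usage sketch for FlagReranker above (not part of the original commit; the checkpoint
# name is only an example).
def _example_flag_reranker():
    """Illustrative only: score two query/passage pairs and squash the logits with a sigmoid."""
    reranker = FlagReranker("BAAI/bge-reranker-large", use_fp16=False)
    return reranker.compute_score(
        [("what is panda?", "The giant panda is a bear species endemic to China."),
         ("what is panda?", "pandas is a Python library for data analysis.")],
        normalize=True)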
class FlagLLMReranker:
def __init__(
self,
model_name_or_path: str = None,
peft_path: str = None,
use_fp16: bool = False,
use_bf16: bool = False,
cache_dir: str = None,
device: Union[str, int] = None
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True,
torch_dtype=torch.bfloat16 if use_bf16 else torch.float32)
if peft_path:
self.model = PeftModel.from_pretrained(self.model,peft_path)
self.model = self.model.merge_and_unload()
self.model_name_or_path = model_name_or_path
self.cache_dir = cache_dir
if device and isinstance(device, str):
self.device = torch.device(device)
else:
device = 0 if device is None else device
if torch.cuda.is_available():
torch.cuda.set_device(device)
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16 and use_bf16 is False:
self.model.half()
self.model = self.model.to(self.device)
self.model.eval()
self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
@torch.no_grad()
def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 16,
max_length: int = 512, prompt: str = None, normalize: bool = False,
use_dataloader: bool = False, num_workers: int = None) -> List[float]:
assert isinstance(sentence_pairs, list)
if isinstance(sentence_pairs[0], str):
sentence_pairs = [sentence_pairs]
length_sorted_idx = np.argsort([-self._text_length(q) - self._text_length(p) for q, p in sentence_pairs])
sentences_sorted = [sentence_pairs[idx] for idx in length_sorted_idx]
dataset, dataloader = None, None
if use_dataloader:
if num_workers is None:
num_workers = min(batch_size, 16)
dataset = DatasetForReranker(sentences_sorted,
self.model_name_or_path,
max_length,
cache_dir=self.cache_dir,
prompt=prompt)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size, drop_last=False,
num_workers=num_workers,
collate_fn=collater(self.tokenizer, max_length))
all_scores = []
if dataloader is not None:
for inputs in tqdm(dataloader):
inputs = inputs.to(self.device)
outputs = self.model(**inputs, output_hidden_states=True)
logits = outputs.logits
scores = last_logit_pool(logits, inputs['attention_mask'])
scores = scores[:, self.yes_loc]
all_scores.extend(scores.cpu().float().tolist())
else:
if prompt is None:
prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
prompt_inputs = self.tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)['input_ids']
sep = "\n"
sep_inputs = self.tokenizer(sep,
return_tensors=None,
add_special_tokens=False)['input_ids']
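            # Budget extra room for the separator and the instruction prompt that are appended after every query/passage pair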
encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs)
for batch_start in trange(0, len(sentences_sorted), batch_size):
batch_sentences = sentences_sorted[batch_start:batch_start + batch_size]
batch_sentences = [(f'A: {q}', f'B: {p}') for q,p in batch_sentences]
queries = [s[0] for s in batch_sentences]
passages = [s[1] for s in batch_sentences]
queries_inputs = self.tokenizer(queries,
return_tensors=None,
add_special_tokens=False,
max_length=max_length * 3 // 4,
truncation=True)
passages_inputs = self.tokenizer(passages,
return_tensors=None,
add_special_tokens=False,
max_length=max_length,
truncation=True)
batch_inputs = []
for query_inputs, passage_inputs in zip(queries_inputs['input_ids'], passages_inputs['input_ids']):
item = self.tokenizer.prepare_for_model(
[self.tokenizer.bos_token_id] + query_inputs,
sep_inputs + passage_inputs,
truncation='only_second',
max_length=encode_max_length,
padding=False,
return_attention_mask=False,
return_token_type_ids=False,
add_special_tokens=False
)
item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
item['attention_mask'] = [1] * len(item['input_ids'])
item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
if 'position_ids' in item.keys():
item['position_ids'] = list(range(len(item['input_ids'])))
batch_inputs.append(item)
collater_instance = collater(self.tokenizer, max_length)
batch_inputs = collater_instance(
[{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']} for item in
batch_inputs])
batch_inputs = {key: val.to(self.device) for key, val in batch_inputs.items()}
outputs = self.model(**batch_inputs, output_hidden_states=True)
logits = outputs.logits
scores = last_logit_pool(logits, batch_inputs['attention_mask'])
scores = scores[:, self.yes_loc]
all_scores.extend(scores.cpu().float().tolist())
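        # Undo the length-based sort so scores line up with the caller's original pair order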
all_scores = [all_scores[idx] for idx in np.argsort(length_sorted_idx)]
if normalize:
all_scores = [sigmoid(score) for score in all_scores]
# if len(all_scores) == 1:
# return all_scores[0]
return all_scores
def _text_length(self, text: Union[List[int], List[List[int]]]):
"""
Help function to get the length for the input text. Text can be either
a list of ints (which means a single text as input), or a tuple of list of ints
(representing several text inputs to the model).
"""
if isinstance(text, dict): # {key: value} case
return len(next(iter(text.values())))
elif not hasattr(text, '__len__'): # Object has no len() method
return 1
elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints
return len(text)
else:
return sum([len(t) for t in text]) # Sum of length of individual strings
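
# Illustrative usage sketch (the checkpoint name below is an assumption; substitute your own LLM-based reranker):
# reranker = FlagLLMReranker('BAAI/bge-reranker-v2-gemma', use_bf16=True)
# scores = reranker.compute_score([('what is a panda?', 'The giant panda is a bear species endemic to China.')])
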
class LayerWiseFlagLLMReranker:
def __init__(
self,
model_name_or_path: str = None,
peft_path: str = None,
use_fp16: bool = False,
use_bf16: bool = False,
cache_dir: str = None,
device: Union[str, int] = None
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True)
if use_bf16 is False and use_fp16 is False:
warnings.warn("Due to model constraints, `use_bf16` and `use_fp16` cannot both be `False`. Here, `use_fp16` is set to `True` by default.", UserWarning)
use_fp16 = True
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
cache_dir=cache_dir,
trust_remote_code=True,
torch_dtype=torch.bfloat16 if use_bf16 else torch.float32)
if peft_path:
self.model = PeftModel.from_pretrained(self.model,peft_path)
self.model = self.model.merge_and_unload()
self.model_name_or_path = model_name_or_path
self.cache_dir = cache_dir
if device and isinstance(device, str):
if device == 'cpu':
warnings.warn('The LLM-based layer-wise reranker does not support CPU; it has been set to CUDA.')
device = 'cuda'
self.device = torch.device(device)
else:
device = 0 if device is None else device
if torch.cuda.is_available():
torch.cuda.set_device(device)
self.device = torch.device("cuda")
elif torch.backends.mps.is_available():
self.device = torch.device("mps")
elif is_torch_npu_available():
self.device = torch.device("npu")
else:
self.device = torch.device("cpu")
use_fp16 = False
if use_fp16 and use_bf16 is False:
self.model.half()
self.model = self.model.to(self.device)
self.model.eval()
self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
@torch.no_grad()
def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 16,
max_length: int = 512, cutoff_layers: List[int] = None, prompt: str = None,
normalize: bool = False, use_dataloader: bool = False,
num_workers: int = None) -> Union[float, List[float], List[List[float]]]:
assert isinstance(sentence_pairs, list)
if isinstance(sentence_pairs[0], str):
sentence_pairs = [sentence_pairs]
length_sorted_idx = np.argsort([-self._text_length(q) - self._text_length(p) for q, p in sentence_pairs])
sentences_sorted = [sentence_pairs[idx] for idx in length_sorted_idx]
dataset, dataloader = None, None
if use_dataloader:
if num_workers is None:
num_workers = min(batch_size, 16)
dataset = DatasetForReranker(sentences_sorted,
self.model_name_or_path,
max_length,
cache_dir=self.cache_dir,
prompt=prompt)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size, drop_last=False,
num_workers=num_workers,
collate_fn=collater(self.tokenizer, max_length))
all_scores = []
if dataloader is not None:
for inputs in tqdm(dataloader):
inputs = inputs.to(self.device)
outputs = self.model(**inputs, output_hidden_states=True, cutoff_layers=cutoff_layers)
all_logits = outputs.logits
tmp_all_scores = []
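                # outputs.logits holds one tensor per requested layer (cutoff_layers); pool a score from each layer's last non-padded position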
for logits in all_logits:
scores = last_logit_pool_layerwise(logits, inputs['attention_mask'])
tmp_all_scores.append(scores.contiguous())
if len(all_scores) == 0:
for _ in range(len(tmp_all_scores)):
all_scores.append([])
for i in range(len(tmp_all_scores)):
all_scores[i].extend(tmp_all_scores[i].cpu().float().tolist())
else:
if prompt is None:
prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
prompt_inputs = self.tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)['input_ids']
sep = "\n"
sep_inputs = self.tokenizer(sep,
return_tensors=None,
add_special_tokens=False)['input_ids']
encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs)
for batch_start in trange(0, len(sentences_sorted), batch_size):
batch_sentences = sentences_sorted[batch_start:batch_start + batch_size]
batch_sentences = [(f'A: {q}', f'B: {p}') for q, p in batch_sentences]
queries = [s[0] for s in batch_sentences]
passages = [s[1] for s in batch_sentences]
queries_inputs = self.tokenizer(queries,
return_tensors=None,
add_special_tokens=False,
max_length=max_length * 3 // 4,
truncation=True)
passages_inputs = self.tokenizer(passages,
return_tensors=None,
add_special_tokens=False,
max_length=max_length,
truncation=True)
batch_inputs = []
for query_inputs, passage_inputs in zip(queries_inputs['input_ids'], passages_inputs['input_ids']):
item = self.tokenizer.prepare_for_model(
[self.tokenizer.bos_token_id] + query_inputs,
sep_inputs + passage_inputs,
truncation='only_second',
max_length=encode_max_length,
padding=False,
return_attention_mask=False,
return_token_type_ids=False,
add_special_tokens=False
)
item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
item['attention_mask'] = [1] * len(item['input_ids'])
item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
if 'position_ids' in item.keys():
item['position_ids'] = list(range(len(item['input_ids'])))
batch_inputs.append(item)
collater_instance = collater(self.tokenizer, max_length)
batch_inputs = collater_instance(
[{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']} for item in
batch_inputs])
batch_inputs = {key: val.to(self.device) for key, val in batch_inputs.items()}
outputs = self.model(**batch_inputs, output_hidden_states=True, cutoff_layers=cutoff_layers)
all_logits = outputs.logits
tmp_all_scores = []
for logits in all_logits:
scores = last_logit_pool_layerwise(logits, batch_inputs['attention_mask'])
tmp_all_scores.append(scores.contiguous())
if len(all_scores) == 0:
for _ in range(len(tmp_all_scores)):
all_scores.append([])
for i in range(len(tmp_all_scores)):
all_scores[i].extend(tmp_all_scores[i].cpu().float().tolist())
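        # For every layer's score list, undo the length-based sort (and optionally squash the scores through a sigmoid)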
for i in range(len(all_scores)):
all_scores[i] = [all_scores[i][idx] for idx in np.argsort(length_sorted_idx)]
if normalize:
all_scores[i] = [sigmoid(score) for score in all_scores[i]]
# if len(all_scores) == 1:
# if len(all_scores[0]) == 1:
# return all_scores[0][0]
# return all_scores[0]
return all_scores
def _text_length(self, text: Union[List[int], List[List[int]]]):
"""
Help function to get the length for the input text. Text can be either
a list of ints (which means a single text as input), or a tuple of list of ints
(representing several text inputs to the model).
"""
if isinstance(text, dict): # {key: value} case
return len(next(iter(text.values())))
elif not hasattr(text, '__len__'): # Object has no len() method
return 1
elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints
return len(text)
else:
return sum([len(t) for t in text]) # Sum of length of individual strings
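
# Illustrative usage sketch (the checkpoint name and layer choice are assumptions):
# reranker = LayerWiseFlagLLMReranker('BAAI/bge-reranker-v2-minicpm-layerwise', use_bf16=True)
# scores = reranker.compute_score([('query', 'passage')], cutoff_layers=[28])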
<div align="center">
<h1>LLM-Embedder [<a href="https://arxiv.org/abs/2310.07554">paper</a>]</h1>
<img src="imgs/llm-embedder.png" width="60%" class="center">
</div>
This is the codebase for LLM-Embedder, a unified embedding model that comprehensively supports the retrieval-augmentation needs of large language models, including knowledge retrieval, memory retrieval, exemplar retrieval, and tool retrieval. It is fine-tuned on 6 tasks:
- *Question Answering (qa)*
- *Conversational Search (convsearch)*
- *Long Conversation (chat)*
- *Long-Range Language Modeling (lrlm)*
- *In-Context Learning (icl)*
- *Tool Learning (tool)*
## Roadmap
- Details about how to fine-tune the LLM-Embedder are [here](docs/fine-tune.md).
- Details about how to evaluate different retrievers on various retrieval-augmented scenarios are [here](docs/evaluation.md).
## Usage
### Using `FlagEmbedding`
```pip install -U FlagEmbedding```
```python
from FlagEmbedding import LLMEmbedder
# Define queries and keys
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]
# Load the model (GPUs are used automatically if available)
model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)
# Encode for a specific task (qa, icl, chat, lrlm, tool, convsearch)
task = "qa"
query_embeddings = model.encode_queries(queries, task=task)
key_embeddings = model.encode_keys(keys, task=task)
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# [[0.8971, 0.8534]
# [0.8462, 0.9091]]
```
### Using `transformers`
```pip install -U transformers```
```python
import torch
from transformers import AutoTokenizer, AutoModel
INSTRUCTIONS = {
"qa": {
"query": "Represent this query for retrieving relevant documents: ",
"key": "Represent this document for retrieval: ",
},
"icl": {
"query": "Convert this example into vector to look for useful examples: ",
"key": "Convert this example into vector for retrieval: ",
},
"chat": {
"query": "Embed this dialogue to find useful historical dialogues: ",
"key": "Embed this historical dialogue for retrieval: ",
},
"lrlm": {
"query": "Embed this text chunk for finding useful historical chunks: ",
"key": "Embed this historical text chunk for retrieval: ",
},
"tool": {
"query": "Transform this user request for fetching helpful tool descriptions: ",
"key": "Transform this tool description for retrieval: "
},
"convsearch": {
"query": "Encode this query and context for searching relevant passages: ",
"key": "Encode this passage for retrieval: ",
},
}
# Define queries and keys
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]
# Load model
tokenizer = AutoTokenizer.from_pretrained('BAAI/llm-embedder')
model = AutoModel.from_pretrained('BAAI/llm-embedder')
# Add instructions for specific task (qa, icl, chat, lrlm, tool, convsearch)
instruction = INSTRUCTIONS["qa"]
queries = [instruction["query"] + query for query in queries]
keys = [instruction["key"] + key for key in keys]
# Tokenize sentences
query_inputs = tokenizer(queries, padding=True, return_tensors='pt')
key_inputs = tokenizer(keys, padding=True, return_tensors='pt')
# Encode
with torch.no_grad():
query_outputs = model(**query_inputs)
key_outputs = model(**key_inputs)
# CLS pooling
query_embeddings = query_outputs.last_hidden_state[:, 0]
key_embeddings = key_outputs.last_hidden_state[:, 0]
# Normalize
query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
key_embeddings = torch.nn.functional.normalize(key_embeddings, p=2, dim=1)
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# [[0.8971, 0.8534]
# [0.8462, 0.9091]]
```
### Using `sentence-transformers`
```pip install -U sentence-transformers```
```python
from sentence_transformers import SentenceTransformer
INSTRUCTIONS = {
"qa": {
"query": "Represent this query for retrieving relevant documents: ",
"key": "Represent this document for retrieval: ",
},
"icl": {
"query": "Convert this example into vector to look for useful examples: ",
"key": "Convert this example into vector for retrieval: ",
},
"chat": {
"query": "Embed this dialogue to find useful historical dialogues: ",
"key": "Embed this historical dialogue for retrieval: ",
},
"lrlm": {
"query": "Embed this text chunk for finding useful historical chunks: ",
"key": "Embed this historical text chunk for retrieval: ",
},
"tool": {
"query": "Transform this user request for fetching helpful tool descriptions: ",
"key": "Transform this tool description for retrieval: "
},
"convsearch": {
"query": "Encode this query and context for searching relevant passages: ",
"key": "Encode this passage for retrieval: ",
},
}
# Define queries and keys
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]
# Load model
model = SentenceTransformer('BAAI/llm-embedder', device="cpu")
# Add instructions for specific task (qa, icl, chat, lrlm, tool, convsearch)
instruction = INSTRUCTIONS["qa"]
queries = [instruction["query"] + query for query in queries]
keys = [instruction["key"] + key for key in keys]
# Encode
query_embeddings = model.encode(queries)
key_embeddings = model.encode(keys)
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# [[0.8971, 0.8534]
# [0.8462, 0.9091]]
```
## Contact
If you have any questions or suggestions related to this project, feel free to open an issue or pull request. You can also email Peitian Zhang (namespace.pt@gmail.com).
## Citation
If you find this repository useful, please consider giving it a star ⭐ and citing it:
```
@misc{zhang2023retrieve,
title={Retrieve Anything To Augment Large Language Models},
author={Peitian Zhang and Shitao Xiao and Zheng Liu and Zhicheng Dou and Jian-Yun Nie},
year={2023},
eprint={2310.07554},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
```
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 0
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
},
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}