Commit f75058c7 authored by Rayyyyy's avatar Rayyyyy
Browse files

First add.

parents
Pipeline #1411 canceled with stages
"""
# 1. Output Search Results with BM25
python bm25_baseline_same_tokenizer.py
# 2. Print and Save Evaluation Results
python step2-eval_sparse_mldr.py \
--encoder bm25_same_tokenizer \
--languages ar de es fr hi it ja ko pt ru th en zh \
--search_result_save_dir ./search_results \
--qrels_dir ../qrels \
--eval_result_save_dir ./eval_results \
--metrics ndcg@10
"""
import os
import datasets
from tqdm import tqdm
from transformers import AutoTokenizer
# Shared BGE-M3 tokenizer (slow/sentencepiece implementation) used to
# pre-tokenize both corpus passages and queries, so the BM25 baseline
# operates on exactly the same token ids as the BGE-M3 model.
tokenizer = AutoTokenizer.from_pretrained(
    'BAAI/bge-m3',
    use_fast=False,
)
def _map_func_corpus(examples):
    """Batched `datasets.map` callback: replace each document's text with a
    whitespace-joined string of its BGE-M3 token ids.

    The leading/trailing special tokens emitted by the tokenizer are
    stripped (`[1:-1]`) before joining.
    """
    encoded = tokenizer(
        examples['text'],
        padding=False,
        truncation=True,
        max_length=8192
    )
    tokenized_texts = [
        " ".join(str(tid) for tid in ids[1:-1])
        for ids in encoded['input_ids']
    ]
    return {'docid': examples['docid'], 'text': tokenized_texts}
def _map_func_query(examples):
    """Batched `datasets.map` callback: replace each query string with a
    whitespace-joined string of its BGE-M3 token ids.

    The leading/trailing special tokens emitted by the tokenizer are
    stripped (`[1:-1]`) before joining.
    """
    encoded = tokenizer(
        examples['query'],
        padding=False,
        truncation=True,
        max_length=512
    )
    tokenized_queries = [
        " ".join(str(tid) for tid in ids[1:-1])
        for ids in encoded['input_ids']
    ]
    return {'query_id': examples['query_id'], 'query': tokenized_queries}
def generate_corpus(lang: str, corpus_save_dir: str):
    """Tokenize the MLDR corpus for `lang` and dump it as a Pyserini-style
    `corpus.jsonl` (fields: id, contents) under `corpus_save_dir`.

    Skips all work if the output file already exists (resume support).
    """
    out_path = os.path.join(corpus_save_dir, 'corpus.jsonl')
    if os.path.exists(out_path):
        return
    raw = datasets.load_dataset('Shitao/MLDR', f'corpus-{lang}', split='corpus')
    tokenized = raw.map(_map_func_corpus, batched=True, num_proc=48)
    records = []
    for entry in tqdm(tokenized, desc="Generating corpus"):
        records.append({'id': entry['docid'], 'contents': entry['text']})
    datasets.Dataset.from_list(records).to_json(out_path, force_ascii=False)
def generate_queries(lang: str, queries_save_dir: str, split: str = 'test'):
    """Tokenize MLDR queries for `lang` and write them as a Pyserini topics
    TSV (one `<query_id>\\t<space-joined token ids>` line per query) at
    `queries_save_dir/{lang}.tsv`.

    Skips all work if the output file already exists (resume support).

    Raises:
        ValueError: if a tokenized query contains a tab or newline, which
            would corrupt the TSV format.
    """
    queries_save_path = os.path.join(queries_save_dir, f"{lang}.tsv")
    if os.path.exists(queries_save_path):
        return
    dataset = datasets.load_dataset('Shitao/MLDR', lang, split=split)
    dataset = dataset.map(_map_func_query, batched=True, num_proc=48)
    with open(queries_save_path, 'w', encoding='utf-8') as f:
        for data in dataset:
            content = data['query']
            # Explicit check (not `assert`) so validation survives `python -O`.
            if '\n' in content or '\t' in content:
                raise ValueError(
                    f"Query {data['query_id']} contains a tab or newline; cannot write TSV"
                )
            f.write(f"{data['query_id']}\t{content}\n")
def index(corpus_save_dir: str, index_save_dir: str):
    """Build a Lucene BM25 index over the pre-tokenized JSON corpus by
    shelling out to Pyserini's indexing CLI.

    Args:
        corpus_save_dir: Directory containing `corpus.jsonl`.
        index_save_dir: Destination directory for the Lucene index.
    """
    # NOTE(review): paths are interpolated into a shell command; assumes they
    # contain no spaces or shell metacharacters — confirm for your setup.
    cmd = f"python -m pyserini.index.lucene \
        --collection JsonCollection \
        --input {corpus_save_dir} \
        --index {index_save_dir} \
        --generator DefaultLuceneDocumentGenerator \
        --threads 1 --optimize \
        "
    os.system(cmd)
def search(index_save_dir: str, queries_save_dir: str, lang: str, result_save_path: str):
    """Run BM25 retrieval (top-1000 hits) for the pre-tokenized topics of
    `lang` against a Lucene index, via Pyserini's search CLI.

    Args:
        index_save_dir: Directory holding the Lucene index.
        queries_save_dir: Directory holding `{lang}.tsv` topic files.
        lang: Language code selecting the topics file.
        result_save_path: Output path for the search-result run file.
    """
    queries_save_path = os.path.join(queries_save_dir, f"{lang}.tsv")
    # NOTE(review): paths are interpolated into a shell command; assumes they
    # contain no spaces or shell metacharacters.
    cmd = f"python -m pyserini.search.lucene \
        --index {index_save_dir} \
        --topics {queries_save_path} \
        --output {result_save_path} \
        --bm25 \
        --hits 1000 \
        --batch-size 128 \
        --threads 16 \
        "
    os.system(cmd)
def main():
    """Run the full BM25-with-BGE-M3-tokenizer baseline.

    For every MLDR language: build the tokenized corpus, index it with
    Lucene, tokenize the queries, and write BM25 search results to
    `./search_results/bm25_same_tokenizer/{lang}.txt`.
    """
    bm25_dir = './bm25_baseline_same_tokenizer'
    result_save_dir = os.path.join('./search_results', 'bm25_same_tokenizer')
    # `exist_ok=True` replaces the racy exists()/makedirs() pattern.
    os.makedirs(result_save_dir, exist_ok=True)
    for lang in ['ar', 'de', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'ko', 'pt', 'ru', 'th', 'zh']:
        save_dir = os.path.join(bm25_dir, lang)
        corpus_save_dir = os.path.join(save_dir, 'corpus')
        os.makedirs(corpus_save_dir, exist_ok=True)  # also creates save_dir
        generate_corpus(lang, corpus_save_dir)
        index_save_dir = os.path.join(save_dir, 'index')
        os.makedirs(index_save_dir, exist_ok=True)
        index(corpus_save_dir, index_save_dir)
        generate_queries(lang, save_dir, split='test')
        result_save_path = os.path.join(result_save_dir, f'{lang}.txt')
        search(index_save_dir, save_dir, lang, result_save_path)


if __name__ == '__main__':
    main()
"""
python step0-encode_query-and-corpus.py \
--encoder BAAI/bge-m3 \
--languages ar de en es fr hi it ja ko pt ru th zh \
--save_dir ./encoded_query-and-corpus \
--max_query_length 512 \
--max_passage_length 8192 \
--batch_size 1024 \
--corpus_batch_size 4 \
--pooling_method cls \
--normalize_embeddings True
"""
import os
import json
import datasets
import numpy as np
from tqdm import tqdm
from FlagEmbedding import BGEM3FlagModel
from dataclasses import dataclass, field
from transformers import HfArgumentParser
@dataclass
class ModelArgs:
    """CLI arguments selecting and configuring the embedding model."""
    # HF hub name or local path of the encoder checkpoint.
    encoder: str = field(
        default="BAAI/bge-m3",
        metadata={'help': 'Name or path of encoder'}
    )
    # How token embeddings are pooled into a single vector ('cls' or 'mean').
    pooling_method: str = field(
        default='cls',
        metadata={'help': "Pooling method. Avaliable methods: 'cls', 'mean'"}
    )
    # L2-normalize output embeddings when True.
    normalize_embeddings: bool = field(
        default=True,
        metadata={'help': "Normalize embeddings or not"}
    )
    # Run inference in half precision.
    fp16: bool = field(
        default=True,
        metadata={'help': 'Use fp16 in inference?'}
    )
@dataclass
class EvalArgs:
    """CLI arguments controlling which languages to encode and where/how the
    encoded queries and corpus are saved."""
    # One or more MLDR language codes (argparse nargs='+').
    languages: str = field(
        default="en",
        metadata={'help': 'Languages to evaluate. Avaliable languages: ar de en es fr hi it ja ko pt ru th zh',
                  "nargs": "+"}
    )
    # Output root; per-language files land in save_dir/{encoder_name}/{lang}/.
    save_dir: str = field(
        default='./encoded_query-and-corpus',
        metadata={'help': 'Dir to save encoded query and corpus. Encoded query and corpus will be saved to `save_dir/{encoder_name}/{lang}/query_embd.tsv` and `save_dir/{encoder_name}/{lang}/corpus/corpus_embd.jsonl`, individually.'}
    )
    # Truncation length for queries.
    max_query_length: int = field(
        default=512,
        metadata={'help': 'Max query length.'}
    )
    # Truncation length for corpus passages.
    max_passage_length: int = field(
        default=8192,
        metadata={'help': 'Max passage length.'}
    )
    # Inference batch size for query encoding.
    batch_size: int = field(
        default=256,
        metadata={'help': 'Inference batch size.'}
    )
    # Inference batch size for corpus encoding (passages are much longer,
    # hence the far smaller default).
    corpus_batch_size: int = field(
        default=4,
        metadata={'help': 'Inference batch size.'}
    )
    # Re-encode even if output files already exist.
    overwrite: bool = field(
        default=False,
        metadata={'help': 'Whether to overwrite embedding'}
    )
def get_model(model_args: ModelArgs):
    """Instantiate a BGE-M3 model configured by the parsed CLI arguments."""
    return BGEM3FlagModel(
        model_name_or_path=model_args.encoder,
        pooling_method=model_args.pooling_method,
        normalize_embeddings=model_args.normalize_embeddings,
        use_fp16=model_args.fp16
    )
def check_languages(languages):
    """Normalize `languages` to a list and validate every entry against the
    MLDR language set.

    Raises:
        ValueError: if any language code is not supported.
    """
    avaliable_languages = ['ar', 'de', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'ko', 'pt', 'ru', 'th', 'zh']
    normalized = [languages] if isinstance(languages, str) else languages
    for lang in normalized:
        if lang not in avaliable_languages:
            raise ValueError(f"Language `{lang}` is not supported. Avaliable languages: {avaliable_languages}")
    return normalized
def load_corpus(lang: str):
    """Load the MLDR corpus for `lang` and rename its fields to
    (id, content) for downstream encoding."""
    raw = datasets.load_dataset('Shitao/MLDR', f'corpus-{lang}', split='corpus')
    records = []
    for entry in tqdm(raw, desc="Generating corpus"):
        records.append({'id': entry['docid'], 'content': entry['text']})
    return datasets.Dataset.from_list(records)
def get_queries(lang: str, split: str = 'test'):
    """Load MLDR queries for `lang`/`split` as a Dataset with fields
    (id, content)."""
    source = datasets.load_dataset('Shitao/MLDR', lang, split=split)
    records = [
        {'id': row['query_id'], 'content': row['query']}
        for row in source
    ]
    return datasets.Dataset.from_list(records)
def encode_corpus(model: BGEM3FlagModel, corpus: datasets.Dataset, max_passage_length: int = 8192, corpus_batch_size: int = 4):
    """Compute BGE-M3 lexical (sparse) weights for every passage and return
    Pyserini impact-index records.

    Each record has an empty `contents` field and a `vector` mapping token id
    to an integer impact score (float weight quantized via ceil(w * 100)).
    """
    lexical_weights = model.encode(
        corpus["content"],
        batch_size=corpus_batch_size,
        max_length=max_passage_length,
        return_dense=False,
        return_sparse=True,
        return_colbert_vecs=False
    )['lexical_weights']
    encoded = []
    for docid, weights in zip(list(corpus["id"]), lexical_weights):
        quantized = {tok: int(np.ceil(w * 100)) for tok, w in weights.items()}
        encoded.append({'id': docid, 'contents': '', 'vector': quantized})
    return encoded
def encode_queries(model: BGEM3FlagModel, queries: datasets.Dataset, max_query_length: int = 512, batch_size: int = 256):
    """Encode queries to sparse lexical weights and render each as a
    Pyserini impact-search topic line.

    Each token id is repeated `impact` times (impact = ceil(weight * 100));
    a query with no tokens is emitted as the single token "0". Lines are
    returned as `"<qid>\\t<tokens>\\n"` strings ready to be written to TSV.
    """
    lexical_weights = model.encode(
        queries["content"],
        batch_size=batch_size,
        max_length=max_query_length,
        return_dense=False,
        return_sparse=True,
        return_colbert_vecs=False
    )['lexical_weights']
    lines = []
    for qid, weights in zip(list(queries["id"]), lexical_weights):
        repeated = []
        for tok, w in weights.items():
            repeated.extend([str(tok)] * int(np.ceil(w * 100)))
        topic_str = " ".join(repeated) if repeated else "0"
        lines.append(f"{str(qid)}\t{topic_str}\n")
    return lines
def save_result(encoded_queries_list: list, encoded_corpus_list: list, save_dir: str):
    """Persist encoded queries and corpus under `save_dir`.

    Queries go to `save_dir/query_embd.tsv` (lines written verbatim) and
    corpus records to `save_dir/corpus/corpus_embd.jsonl`, one JSON object
    per line.
    """
    queries_save_path = os.path.join(save_dir, 'query_embd.tsv')
    corpus_save_path = os.path.join(save_dir, 'corpus', 'corpus_embd.jsonl')
    corpus_dir = os.path.dirname(corpus_save_path)
    if not os.path.exists(corpus_dir):
        os.makedirs(corpus_dir)
    with open(queries_save_path, 'w', encoding='utf-8') as out:
        for line in tqdm(encoded_queries_list, desc="Saving encoded queries"):
            out.write(line)
    with open(corpus_save_path, 'w', encoding='utf-8') as out:
        for record in tqdm(encoded_corpus_list, desc="Saving encoded corpus"):
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
def main():
    """Encode MLDR queries and corpus for every requested language into
    sparse (lexical-weight) representations, saved under
    `save_dir/{encoder_name}/{lang}/`."""
    parser = HfArgumentParser([ModelArgs, EvalArgs])
    model_args, eval_args = parser.parse_args_into_dataclasses()
    model_args: ModelArgs
    eval_args: EvalArgs
    languages = check_languages(eval_args.languages)
    # languages.reverse()
    if model_args.encoder[-1] == '/':
        model_args.encoder = model_args.encoder[:-1]
    model = get_model(model_args=model_args)
    encoder = model_args.encoder
    # Checkpoint dirs get unique output names: `run/checkpoint-42`
    # becomes `run_checkpoint-42`.
    if os.path.basename(encoder).startswith('checkpoint-'):
        encoder = os.path.dirname(encoder) + '_' + os.path.basename(encoder)
    print("==================================================")
    print("Start generating embedding with model:")
    print(model_args.encoder)
    print('Generate embedding of following languages: ', languages)
    for lang in languages:
        print("**************************************************")
        save_dir = os.path.join(eval_args.save_dir, os.path.basename(encoder), lang)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        # Resume support: skip languages whose corpus embedding already exists.
        if os.path.exists(os.path.join(save_dir, 'corpus', 'corpus_embd.jsonl')) and not eval_args.overwrite:
            print(f'Embedding of {lang} already exists. Skip...')
            continue
        print(f"Start generating query and corpus embedding of {lang} ...")
        queries = get_queries(lang, split='test')
        encoded_queries_list = encode_queries(
            model=model,
            queries=queries,
            max_query_length=eval_args.max_query_length,
            batch_size=eval_args.batch_size
        )
        corpus = load_corpus(lang)
        encoded_corpus_list = encode_corpus(
            model=model,
            corpus=corpus,
            max_passage_length=eval_args.max_passage_length,
            corpus_batch_size=eval_args.corpus_batch_size
        )
        save_result(
            encoded_queries_list=encoded_queries_list,
            encoded_corpus_list=encoded_corpus_list,
            save_dir=save_dir
        )
    print("==================================================")
    print("Finish generating embeddings with model:")
    print(model_args.encoder)


if __name__ == "__main__":
    main()
"""
python step1-search_results.py \
--encoder BAAI/bge-m3 \
--languages ar de en es fr hi it ja ko pt ru th zh \
--encoded_query_and_corpus_save_dir ./encoded_query-and-corpus \
--result_save_dir ./search_results \
--threads 16 \
--hits 1000
"""
import os
from dataclasses import dataclass, field
from transformers import HfArgumentParser
@dataclass
class ModelArgs:
    """CLI argument naming the encoder whose encoded outputs are searched."""
    # HF hub name or local path of the encoder checkpoint.
    encoder: str = field(
        default="BAAI/bge-m3",
        metadata={'help': 'Name or path of encoder'}
    )
@dataclass
class EvalArgs:
    """CLI arguments controlling impact-index construction and search."""
    # One or more MLDR language codes (argparse nargs='+').
    languages: str = field(
        default="en",
        metadata={'help': 'Languages to evaluate. Avaliable languages: ar de en es fr hi it ja ko pt ru th zh',
                  "nargs": "+"}
    )
    # Input root produced by step0 (encoded queries + corpus per language).
    encoded_query_and_corpus_save_dir: str = field(
        default='./encoded_query-and-corpus',
        metadata={'help': 'Dir to save encoded queries and corpus. Encoded queries and corpus are saved in `save_dir/{encoder_name}/{lang}/query_embd.tsv` and `save_dir/{encoder_name}/{lang}/corpus/corpus_embd.jsonl`, individually.'}
    )
    # Output root for per-language TREC run files.
    result_save_dir: str = field(
        default='./search_results',
        metadata={'help': 'Dir to saving results. Search results will be saved to `result_save_dir/{encoder_name}/{lang}.txt`'}
    )
    # Pyserini search batch size.
    batch_size: int = field(
        default=32,
        metadata={'help': 'Batch size to use during search'}
    )
    # Threads for both indexing and searching.
    threads: int = field(
        default=1,
        metadata={'help': 'Maximum threads to use during search'}
    )
    # Number of hits retrieved per query.
    hits: int = field(
        default=1000,
        metadata={'help': 'Number of hits'}
    )
    # Rebuild indexes / rerun searches even if outputs already exist.
    overwrite: bool = field(
        default=False,
        metadata={'help': 'Whether to overwrite embedding'}
    )
def check_languages(languages):
    """Coerce `languages` to a list and verify each code is a supported MLDR
    language.

    Raises:
        ValueError: on the first unsupported language code.
    """
    supported = ['ar', 'de', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'ko', 'pt', 'ru', 'th', 'zh']
    checked = [languages] if isinstance(languages, str) else languages
    for code in checked:
        if code not in supported:
            raise ValueError(f"Language `{code}` is not supported. Avaliable languages: {supported}")
    return checked
def generate_index(lang: str, corpus_embd_dir: str, index_save_dir: str, threads: int=12):
    """Build a Lucene impact index over pre-encoded sparse vectors
    (`JsonVectorCollection`) by shelling out to Pyserini's indexing CLI.

    Args:
        lang: Language code passed to the Lucene analyzer.
        corpus_embd_dir: Directory holding `corpus_embd.jsonl`.
        index_save_dir: Destination directory for the index.
        threads: Indexing thread count.
    """
    # NOTE(review): paths are interpolated into a shell command; assumes they
    # contain no spaces or shell metacharacters.
    cmd = f"python -m pyserini.index.lucene \
        --language {lang} \
        --collection JsonVectorCollection \
        --input {corpus_embd_dir} \
        --index {index_save_dir} \
        --generator DefaultLuceneDocumentGenerator \
        --threads {threads} \
        --impact --pretokenized --optimize \
        "
    os.system(cmd)
def search_and_save_results(index_save_dir: str, query_embd_path: str, result_save_path: str, batch_size: int = 32, threads: int = 12, hits: int = 1000):
    """Run impact (sparse) search over a Lucene impact index with Pyserini's
    search CLI and save TREC-format results to `result_save_path`.

    Args:
        index_save_dir: Directory holding the impact index.
        query_embd_path: TSV of encoded query topics (`qid\\ttokens`).
        result_save_path: Output path for the TREC run file.
        batch_size / threads / hits: Pyserini search parameters.
    """
    # NOTE(review): paths are interpolated into a shell command; assumes they
    # contain no spaces or shell metacharacters.
    cmd = f"python -m pyserini.search.lucene \
        --index {index_save_dir} \
        --topics {query_embd_path} \
        --output {result_save_path} \
        --output-format trec \
        --batch {batch_size} \
        --threads {threads} \
        --hits {hits} \
        --impact \
        "
    os.system(cmd)
def main():
    """For every requested language: build (or reuse) a Lucene impact index
    over the pre-encoded sparse corpus, then run impact search with the
    encoded queries and save TREC-format results."""
    parser = HfArgumentParser([ModelArgs, EvalArgs])
    model_args, eval_args = parser.parse_args_into_dataclasses()
    model_args: ModelArgs
    eval_args: EvalArgs
    languages = check_languages(eval_args.languages)
    if model_args.encoder[-1] == '/':
        model_args.encoder = model_args.encoder[:-1]
    encoder = model_args.encoder
    # Checkpoint dirs get unique output names: `run/checkpoint-42`
    # becomes `run_checkpoint-42`.
    if os.path.basename(encoder).startswith('checkpoint-'):
        encoder = os.path.dirname(encoder) + '_' + os.path.basename(encoder)
    print("==================================================")
    print("Start generating search results with model:")
    print(model_args.encoder)
    print('Generate search results of following languages: ', languages)
    for lang in languages:
        print("**************************************************")
        print(f"Start searching results of {lang} ...")
        result_save_path = os.path.join(eval_args.result_save_dir, os.path.basename(encoder), f"{lang}.txt")
        if not os.path.exists(os.path.dirname(result_save_path)):
            os.makedirs(os.path.dirname(result_save_path))
        # Resume support: skip languages already searched.
        if os.path.exists(result_save_path) and not eval_args.overwrite:
            print(f'Search results of {lang} already exists. Skip...')
            continue
        encoded_query_and_corpus_save_dir = os.path.join(eval_args.encoded_query_and_corpus_save_dir, os.path.basename(encoder), lang)
        if not os.path.exists(encoded_query_and_corpus_save_dir):
            raise FileNotFoundError(f"{encoded_query_and_corpus_save_dir} not found")
        corpus_embd_dir = os.path.join(encoded_query_and_corpus_save_dir, 'corpus')
        index_save_dir = os.path.join(eval_args.encoded_query_and_corpus_save_dir, os.path.basename(encoder), lang, 'index')
        # Reuse an existing index unless --overwrite was passed.
        if os.path.exists(index_save_dir) and not eval_args.overwrite:
            print(f'Index of {lang} already exists')
        else:
            generate_index(
                lang=lang,
                corpus_embd_dir=corpus_embd_dir,
                index_save_dir=index_save_dir,
                threads=eval_args.threads
            )
        query_embd_path = os.path.join(encoded_query_and_corpus_save_dir, 'query_embd.tsv')
        search_and_save_results(
            index_save_dir=index_save_dir,
            query_embd_path=query_embd_path,
            result_save_path=result_save_path,
            batch_size=eval_args.batch_size,
            threads=eval_args.threads,
            hits=eval_args.hits
        )
    print("==================================================")
    print("Finish generating search results with model:")
    print(model_args.encoder)


if __name__ == "__main__":
    main()
"""
Ref: https://github.com/texttron/tevatron/tree/main/examples/unicoil
# 1. Generate Query and Corpus Sparse Vector
python step0-encode_query-and-corpus.py \
--encoder BAAI/bge-m3 \
--languages ar de en es fr hi it ja ko pt ru th zh \
--save_dir ./encoded_query-and-corpus \
--max_query_length 512 \
--max_passage_length 8192 \
--batch_size 1024 \
--corpus_batch_size 4 \
--pooling_method cls \
--normalize_embeddings True
# 2. Output Search Results
python step1-search_results.py \
--encoder BAAI/bge-m3 \
--languages ar de en es fr hi it ja ko pt ru th zh \
--encoded_query_and_corpus_save_dir ./encoded_query-and-corpus \
--result_save_dir ./search_results \
--threads 16 \
--hits 1000
# 3. Print and Save Evaluation Results
python step2-eval_sparse_mldr.py \
--encoder BAAI/bge-m3 \
--languages ar de es fr hi it ja ko pt ru th en zh \
--search_result_save_dir ./search_results \
--qrels_dir ../qrels \
--eval_result_save_dir ./eval_results \
--metrics ndcg@10 \
--pooling_method cls \
--normalize_embeddings True
"""
import os
import json
import platform
import subprocess
import numpy as np
from pprint import pprint
from dataclasses import dataclass, field
from transformers import HfArgumentParser
from pyserini.util import download_evaluation_script
@dataclass
class EvalArgs:
    """CLI arguments for scoring saved search results with trec_eval."""
    # One or more MLDR language codes (argparse nargs='+').
    languages: str = field(
        default="en",
        metadata={'help': 'Languages to evaluate. Avaliable languages: ar de en es fr hi it ja ko pt ru th zh',
                  "nargs": "+"}
    )
    # HF hub name or local path of the encoder whose results are evaluated.
    encoder: str = field(
        default='BAAI/bge-m3',
        metadata={'help': 'Name or path of encoder'}
    )
    # Recorded verbatim in the output report (blanked for BM25 runs).
    pooling_method: str = field(
        default='cls',
        metadata={'help': "Pooling method. Avaliable methods: 'cls', 'mean'"}
    )
    # Recorded verbatim in the output report (blanked for BM25 runs).
    normalize_embeddings: bool = field(
        default=True,
        metadata={'help': "Normalize embeddings or not"}
    )
    # Root of per-language TREC run files from step1.
    search_result_save_dir: str = field(
        default='./search_results',
        metadata={'help': 'Dir to saving search results. Search results path is `result_save_dir/{encoder}/{lang}.txt`'}
    )
    # Directory holding `qrels.mldr-v1.0-{lang}-test.tsv` files.
    qrels_dir: str = field(
        default='../qrels',
        metadata={'help': 'Dir to qrels.'}
    )
    # One or more metric specs of the form `name@k` (argparse nargs='+').
    metrics: str = field(
        default="ndcg@10",
        metadata={'help': 'Metrics to evaluate. Avaliable metrics: ndcg@k, recall@k',
                  "nargs": "+"}
    )
    # Output directory for the aggregated JSON report.
    eval_result_save_dir: str = field(
        default='./eval_results',
        metadata={'help': 'Dir to saving evaluation results. Evaluation results will be saved to `eval_result_save_dir/{encoder}.json`'}
    )
def check_languages(languages):
    """Return `languages` as a list after confirming every entry is a
    supported MLDR language code.

    Raises:
        ValueError: on the first unsupported code encountered.
    """
    avaliable_languages = ['ar', 'de', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'ko', 'pt', 'ru', 'th', 'zh']
    if isinstance(languages, str):
        languages = [languages]
    for lang in languages:
        if lang in avaliable_languages:
            continue
        raise ValueError(f"Language `{lang}` is not supported. Avaliable languages: {avaliable_languages}")
    return languages
def compute_average(results: dict):
    """Average each metric across languages.

    Args:
        results: {lang: {metric: score}} mapping.

    Returns:
        {metric: mean score} across all languages.
    """
    scores_by_metric = {}
    for per_lang in results.values():
        for metric, score in per_lang.items():
            scores_by_metric.setdefault(metric, []).append(score)
    return {metric: np.mean(scores) for metric, scores in scores_by_metric.items()}
def save_results(model_name: str, pooling_method: str, normalize_embeddings: bool, results: dict, save_path: str, eval_languages: list):
    """Pretty-print per-language results, add their cross-language average,
    and persist everything as a JSON report at `save_path`.

    Args:
        model_name: Encoder name recorded in the report.
        pooling_method: Encoder pooling setting (blanked for BM25 runs).
        normalize_embeddings: Encoder normalize setting (blanked for BM25 runs).
        results: {lang: {metric: score}}; mutated in place to add 'average'.
        save_path: Destination JSON file; parent dirs are created as needed.
        eval_languages: Languages evaluated, echoed in the final log line.
    """
    try:
        results['average'] = compute_average(results)
    except (TypeError, ValueError, ZeroDivisionError):
        # Averaging fails when a score is non-numeric (e.g. raw trec_eval
        # output kept by `evaluate`); record that explicitly instead of
        # silently swallowing every exception.
        results['average'] = None
    pprint(results)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # BM25 runs have no pooling/normalization settings to record.
    if 'bm25' in model_name:
        pooling_method = ''
        normalize_embeddings = ''
    results_dict = {
        'model': model_name,
        'pooling_method': pooling_method,
        'normalize_embeddings': normalize_embeddings,
        'results': results
    }
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(results_dict, f, indent=4, ensure_ascii=False)
    print(f'Results of evaluating `{model_name}` on `{eval_languages}` saved at `{save_path}`')
def map_metric(metric: str):
    """Map a `name@k` metric spec to a (k, trec_eval measure) pair.

    Supported: `ndcg@k` -> `ndcg_cut.k`, `recall@k` -> `recall.k`
    (case-insensitive metric names).

    Raises:
        ValueError: if the spec lacks an `@k` suffix or names an
            unsupported metric.
    """
    name, _, k = metric.partition('@')
    if not k:
        raise ValueError(f"Malformed metric (expected `name@k`): {metric}")
    if name.lower() == 'ndcg':
        return k, f'ndcg_cut.{k}'
    elif name.lower() == 'recall':
        return k, f'recall.{k}'
    else:
        # Fixed typo in the original message ("Unkown").
        raise ValueError(f"Unknown metric: {name}")
def evaluate(script_path, qrels_path, search_result_path, metrics: list):
    """Run the trec_eval jar once per metric and parse the score from its
    stdout.

    Args:
        script_path: Path to the trec_eval jar.
        qrels_path: Path to the qrels file.
        search_result_path: Path to the TREC run file.
        metrics: Metric specs understood by `map_metric` (e.g. 'ndcg@10').

    Returns:
        {metric: float score}; if parsing fails, the raw stdout string is
        stored instead so the failure stays visible in the report.
    """
    cmd_prefix = ['java', '-jar', script_path]
    # shell=True is needed for the java launcher on Windows only.
    shell = platform.system() == "Windows"
    results = {}
    for metric in metrics:
        k, mapped_metric = map_metric(metric)
        cmd = cmd_prefix + ['-c', '-M', str(k), '-m', mapped_metric, qrels_path, search_result_path]
        # subprocess.run replaces the manual Popen/communicate dance.
        proc = subprocess.run(cmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              shell=shell)
        if proc.stderr:
            print(proc.stderr.decode("utf-8"))
        result_str = proc.stdout.decode("utf-8")
        try:
            results[metric] = float(result_str.split(' ')[-1].split('\t')[-1])
        except (ValueError, IndexError):
            # Keep the raw output when trec_eval printed something unexpected,
            # rather than swallowing arbitrary exceptions with a bare except.
            results[metric] = result_str
    return results
def main():
    """Evaluate saved search results for each language with trec_eval and
    write the aggregated scores (plus their cross-language average) to a
    JSON report."""
    parser = HfArgumentParser([EvalArgs])
    eval_args = parser.parse_args_into_dataclasses()[0]
    eval_args: EvalArgs
    languages = check_languages(eval_args.languages)
    # Fetch Pyserini's bundled trec_eval jar.
    script_path = download_evaluation_script('trec_eval')
    if eval_args.encoder[-1] == '/':
        eval_args.encoder = eval_args.encoder[:-1]
    encoder = eval_args.encoder
    # Checkpoint dirs get unique result names: `run/checkpoint-42`
    # becomes `run_checkpoint-42`.
    if os.path.basename(encoder).startswith('checkpoint-'):
        encoder = os.path.dirname(encoder) + '_' + os.path.basename(encoder)
    results = {}
    for lang in languages:
        print("*****************************")
        print(f"Start evaluating {lang} ...")
        qrels_path = os.path.join(eval_args.qrels_dir, f"qrels.mldr-v1.0-{lang}-test.tsv")
        search_result_save_dir = os.path.join(eval_args.search_result_save_dir, os.path.basename(encoder))
        search_result_path = os.path.join(search_result_save_dir, f"{lang}.txt")
        result = evaluate(script_path, qrels_path, search_result_path, eval_args.metrics)
        results[lang] = result
    save_results(
        model_name=encoder,
        pooling_method=eval_args.pooling_method,
        normalize_embeddings=eval_args.normalize_embeddings,
        results=results,
        save_path=os.path.join(eval_args.eval_result_save_dir, f"{os.path.basename(encoder)}.json"),
        eval_languages=languages
    )
    print("==================================================")
    print("Finish generating evaluation results with model:")
    print(eval_args.encoder)


if __name__ == "__main__":
    main()
<h1 align="center">Chinese Massive Text Embedding Benchmark</h1>
<p align="center">
<a href="https://www.python.org/">
<img alt="Build" src="https://img.shields.io/badge/Contribution-Welcome-blue">
</a>
<a href="https://huggingface.co/C-MTEB">
<img alt="Build" src="https://img.shields.io/badge/C_MTEB-🤗-yellow">
</a>
<a href="https://www.python.org/">
<img alt="Build" src="https://img.shields.io/badge/Made with-Python-red">
</a>
</p>
<h4 align="center">
<p>
<a href=#installation>Installation</a> |
<a href=#evaluation>Evaluation</a> |
<a href="#leaderboard">Leaderboard</a> |
<a href="#tasks">Tasks</a> |
<a href="#acknowledgement">Acknowledgement</a> |
<p>
</h4>
## Installation
C-MTEB is developed based on [MTEB](https://github.com/embeddings-benchmark/mteb).
```
pip install -U C_MTEB
```
Or clone this repo and install as editable
```
git clone https://github.com/FlagOpen/FlagEmbedding.git
cd FlagEmbedding/C_MTEB
pip install -e .
```
## Evaluation
### Evaluate reranker
```bash
python eval_cross_encoder.py --model_name_or_path BAAI/bge-reranker-base
```
### Evaluate embedding model
* **With our scripts**
You can **reproduce the results of `baai-general-embedding (bge)`** using the provided python script (see [eval_C-MTEB.py](./eval_C-MTEB.py) )
```bash
python eval_C-MTEB.py --model_name_or_path BAAI/bge-large-zh
# for MTEB leaderboard
python eval_MTEB.py --model_name_or_path BAAI/bge-large-en
```
* **With sentence-transformers**
You can use C-MTEB easily in the same way as [MTEB](https://github.com/embeddings-benchmark/mteb).
Note that the original sentence-transformers model doesn't support instruction.
So this method cannot test the performance of `bge-*` models.
```python
from mteb import MTEB
from C_MTEB import *
from sentence_transformers import SentenceTransformer
# Define the sentence-transformers model name
model_name = "bert-base-uncased"
model = SentenceTransformer(model_name)
evaluation = MTEB(task_langs=['zh'])
results = evaluation.run(model, output_folder=f"zh_results/{model_name}")
```
* **Using a custom model**
To evaluate a new model, you can load it via sentence_transformers if it is supported by sentence_transformers.
Otherwise, models should be implemented like below (implementing an `encode` function taking as input a list of sentences, and returning a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.).):
```python
class MyModel():
def encode(self, sentences, batch_size=32, **kwargs):
""" Returns a list of embeddings for the given sentences.
Args:
sentences (`List[str]`): List of sentences to encode
batch_size (`int`): Batch size for the encoding
Returns:
`List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences
"""
pass
model = MyModel()
evaluation = MTEB(tasks=["T2Retrieval"])
evaluation.run(model)
```
## Acknowledgement
We thank the great tool from [Massive Text Embedding Benchmark](https://github.com/embeddings-benchmark/mteb) and the open-source datasets from Chinese NLP community.
## Citation
If you find this repository useful, please consider citing it:
```
@misc{c-pack,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
year={2023},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
<h1 align="center">Chinese Massive Text Embedding Benchmark</h1>
<p align="center">
<a href="https://www.python.org/">
<img alt="Build" src="https://img.shields.io/badge/Contribution-Welcome-blue">
</a>
<a href="https://huggingface.co/C-MTEB">
<img alt="Build" src="https://img.shields.io/badge/C_MTEB-🤗-yellow">
</a>
<a href="https://www.python.org/">
<img alt="Build" src="https://img.shields.io/badge/Made with-Python-red">
</a>
</p>
<h4 align="center">
<p>
<a href=#installation>Installation</a> |
<a href=#evaluation>Evaluation</a> |
<a href="#leaderboard">Leaderboard</a> |
<a href="#tasks">Tasks</a> |
<a href="#acknowledgement">Acknowledgement</a> |
<p>
</h4>
## Installation
C-MTEB is developed based on [MTEB](https://github.com/embeddings-benchmark/mteb).
```
pip install -U C_MTEB
```
Or clone this repo and install as editable
```
git clone https://github.com/FlagOpen/FlagEmbedding.git
cd FlagEmbedding/C_MTEB
pip install -e .
```
## Evaluation
### Evaluate reranker
```bash
python eval_cross_encoder.py --model_name_or_path BAAI/bge-reranker-base
```
### Evaluate embedding model
* **With our scripts**
You can **reproduce the results of `baai-general-embedding (bge)`** using the provided python script (see [eval_C-MTEB.py](./eval_C-MTEB.py) )
```bash
python eval_C-MTEB.py --model_name_or_path BAAI/bge-large-zh
# for MTEB leaderboard
python eval_MTEB.py --model_name_or_path BAAI/bge-large-en
```
* **With sentence-transformers**
You can use C-MTEB easily in the same way as [MTEB](https://github.com/embeddings-benchmark/mteb).
Note that the original sentence-transformers model doesn't support instruction.
So this method cannot test the performance of `bge-*` models.
```python
from mteb import MTEB
from C_MTEB import *
from sentence_transformers import SentenceTransformer
# Define the sentence-transformers model name
model_name = "bert-base-uncased"
model = SentenceTransformer(model_name)
evaluation = MTEB(task_langs=['zh'])
results = evaluation.run(model, output_folder=f"zh_results/{model_name}")
```
* **Using a custom model**
To evaluate a new model, you can load it via sentence_transformers if it is supported by sentence_transformers.
Otherwise, models should be implemented like below (implementing an `encode` function taking as input a list of sentences, and returning a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.).):
```python
class MyModel():
def encode(self, sentences, batch_size=32, **kwargs):
""" Returns a list of embeddings for the given sentences.
Args:
sentences (`List[str]`): List of sentences to encode
batch_size (`int`): Batch size for the encoding
Returns:
`List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences
"""
pass
model = MyModel()
evaluation = MTEB(tasks=["T2Retrieval"])
evaluation.run(model)
```
## Leaderboard
### 1. Reranker
| Model | T2Reranking | T2RerankingZh2En\* | T2RerankingEn2Zh\* | MMarcoReranking | CMedQAv1 | CMedQAv2 | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
| text2vec-base-multilingual | 64.66 | 62.94 | 62.51 | 14.37 | 48.46 | 48.6 | 50.26 |
| multilingual-e5-small | 65.62 | 60.94 | 56.41 | 29.91 | 67.26 | 66.54 | 57.78 |
| multilingual-e5-large | 64.55 | 61.61 | 54.28 | 28.6 | 67.42 | 67.92 | 57.4 |
| multilingual-e5-base | 64.21 | 62.13 | 54.68 | 29.5 | 66.23 | 66.98 | 57.29 |
| m3e-base | 66.03 | 62.74 | 56.07 | 17.51 | 77.05 | 76.76 | 59.36 |
| m3e-large | 66.13 | 62.72 | 56.1 | 16.46 | 77.76 | 78.27 | 59.57 |
| bge-base-zh-v1.5 | 66.49 | 63.25 | 57.02 | 29.74 | 80.47 | 84.88 | 63.64 |
| bge-large-zh-v1.5 | 65.74 | 63.39 | 57.03 | 28.74 | 83.45 | 85.44 | 63.97 |
| [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) | 67.28 | 63.95 | 60.45 | 35.46 | 81.26 | 84.1 | 65.42 |
| [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | 67.6 | 64.03 | 61.44 | 37.16 | 82.15 | 84.18 | 66.09 |
\* : T2RerankingZh2En and T2RerankingEn2Zh are cross-language retrieval tasks
### 2. Embedding
| Model | Embedding dimension | Avg | Retrieval | STS | PairClassification | Classification | Reranking | Clustering |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
| [BAAI/bge-large-zh-v1.5](https://huggingface.co/BAAI/bge-large-zh-v1.5) | 1024 | **64.53** | 70.46 | 56.25 | 81.6 | 69.13 | 65.84 | 48.99 |
| [BAAI/bge-base-zh-v1.5](https://huggingface.co/BAAI/bge-base-zh-v1.5) | 768 | 63.13 | 69.49 | 53.72 | 79.75 | 68.07 | 65.39 | 47.53 |
| [BAAI/bge-small-zh-v1.5](https://huggingface.co/BAAI/bge-small-zh-v1.5) | 512 | 57.82 | 61.77 | 49.11 | 70.41 | 63.96 | 60.92 | 44.18 |
| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh) | 1024 | 64.20 | 71.53 | 54.98 | 78.94 | 68.32 | 65.11 | 48.39 |
| [BAAI/bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 1024 | 63.53 | 70.55 | 53 | 76.77 | 68.58 | 64.91 | 50.01 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 768 | 62.96 | 69.53 | 54.12 | 77.5 | 67.07 | 64.91 | 47.63 |
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | 1024 | 58.79 | 63.66 | 48.44 | 69.89 | 67.34 | 56.00 | 48.23 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 512 | 58.27 | 63.07 | 49.45 | 70.35 | 63.64 | 61.48 | 45.09 |
| [m3e-base](https://huggingface.co/moka-ai/m3e-base) | 768 | 57.10 | 56.91 | 50.47 | 63.99 | 67.52 | 59.34 | 47.68 |
| [m3e-large](https://huggingface.co/moka-ai/m3e-large) | 1024 | 57.05 | 54.75 | 50.42 | 64.3 | 68.2 | 59.66 | 48.88 |
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 768 | 55.48 | 61.63 | 46.49 | 67.07 | 65.35 | 54.35 | 40.68 |
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 384 | 55.38 | 59.95 | 45.27 | 66.45 | 65.85 | 53.86 | 45.26 |
| [text-embedding-ada-002(OpenAI)](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) | 1536 | 53.02 | 52.0 | 43.35 | 69.56 | 64.31 | 54.28 | 45.68 |
| [luotuo](https://huggingface.co/silk-road/luotuo-bert-medium) | 1024 | 49.37 | 44.4 | 42.78 | 66.62 | 61 | 49.25 | 44.39 |
| [text2vec-base](https://huggingface.co/shibing624/text2vec-base-chinese) | 768 | 47.63 | 38.79 | 43.41 | 67.41 | 62.19 | 49.45 | 37.66 |
| [text2vec-large](https://huggingface.co/GanymedeNil/text2vec-large-chinese) | 1024 | 47.36 | 41.94 | 44.97 | 70.86 | 60.66 | 49.16 | 30.02 |
### 2.1. Retrieval
| Model | T2Retrieval | MMarcoRetrieval | DuRetrieval | CovidRetrieval | CmedqaRetrieval | EcomRetrieval | MedicalRetrieval | VideoRetrieval | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
| luotuo-bert-medium | 58.67 | 55.31 | 59.36 | 55.48 | 18.04 | 40.48 | 29.8 | 38.04 | 44.4 |
| text2vec-large-chinese | 50.52 | 45.96 | 51.87 | 60.48 | 15.53 | 37.58 | 30.93 | 42.65 | 41.94 |
| text2vec-base-chinese | 51.67 | 44.06 | 52.23 | 44.81 | 15.91 | 34.59 | 27.56 | 39.52 | 38.79 |
| m3e-base | 73.14 | 65.45 | 75.76 | 66.42 | 30.33 | 50.27 | 42.8 | 51.11 | 56.91 |
| m3e-large | 72.36 | 61.06 | 74.69 | 61.33 | 30.73 | 45.18 | 48.66 | 44.02 | 54.75 |
| OpenAI(text-embedding-ada-002) | 69.14 | 69.86 | 71.17 | 57.21 | 22.36 | 44.49 | 37.92 | 43.85 | 52.0 |
| multilingual-e5-small | 71.39 | 73.17 | 81.35 | 72.82 | 24.38 | 53.56 | 44.84 | 58.09 | 59.95 |
| multilingual-e5-base | 70.86 | 76.04 | 81.64 | 73.45 | 27.2 | 54.17 | 48.35 | 61.3 | 61.63 |
| multilingual-e5-large | 76.11 | 79.2 | 85.32 | 75.51 | 28.67 | 54.75 | 51.44 | 58.25 | 63.66 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 77.59 | 67.56 | 77.89 | 68.95 | 35.18 | 58.17 | 49.9 | 69.33 | 63.07 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 83.35 | 79.11 | 86.02 | 72.07 | 41.77 | 63.53 | 56.64 | 73.76 | 69.53 |
| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 84.39 | 81.38 | 84.68 | 75.07 | 41.03 | 65.6 | 58.28 | 73.94 | 70.55 |
| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 84.82 | 81.28 | 86.94 | 74.06 | 42.4 | 66.12 | 59.39 | 77.19 | 71.53 |
### 2.2. STS
| Model | ATEC | BQ | LCQMC | PAWSX | STSB | AFQMC | QBQTC | STS22 (zh) | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
| luotuo-bert-medium | 30.84 | 43.33 | 66.74 | 12.31 | 73.22 | 22.24 | 27.2 | 66.4 | 42.78 |
| text2vec-large-chinese | 32.45 | 44.22 | 69.16 | 14.55 | 79.45 | 24.51 | 29.51 | 65.94 | 44.97 |
| text2vec-base-chinese | 31.93 | 42.67 | 70.16 | 17.21 | 79.3 | 26.06 | 24.62 | 55.35 | 43.41 |
| m3e-base | 41.27 | 63.81 | 74.88 | 12.19 | 76.97 | 35.87 | 32.07 | 66.73 | 50.47 |
| m3e-large | 41.8 | 65.2 | 74.2 | 15.95 | 74.16 | 36.53 | 32.65 | 62.91 | 50.42 |
| OpenAI(text-embedding-ada-002) | 29.25 | 45.33 | 68.41 | 16.55 | 70.61 | 23.88 | 30.27 | 62.53 | 43.35 |
| multilingual-e5-small | 35.14 | 43.27 | 72.7 | 11.01 | 77.73 | 25.21 | 30.25 | 66.84 | 45.27 |
| multilingual-e5-base | 37.01 | 45.45 | 74.15 | 12.14 | 79.05 | 29.67 | 28.81 | 65.64 | 46.49 |
| multilingual-e5-large | 39.81 | 46.44 | 75.95 | 14.63 | 81.08 | 33.02 | 29.77 | 66.82 | 48.44 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 43.17 | 55.47 | 72.61 | 9.97 | 76.48 | 33.93 | 36.45 | 67.54 | 49.45 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 48.28 | 61.21 | 74.98 | 20.65 | 78.66 | 42.53 | 38.01 | 68.64 | 54.12 |
| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 48.29 | 60.53 | 74.71 | 16.64 | 78.41 | 43.06 | 35.2 | 67.19 | 53 |
| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 49.75 | 62.93 | 75.45 | 22.45 | 78.51 | 44.57 | 38.92 | 67.24 | 54.98 |
### 2.3. PairClassification
| Model | Ocnli | Cmnli | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|
| luotuo-bert-medium | 60.7 | 72.55 | 66.62 |
| text2vec-large-chinese | 64.04 | 77.67 | 70.86 |
| text2vec-base-chinese | 60.95 | 73.87 | 67.41 |
| m3e-base | 58.0 | 69.98 | 63.99 |
| m3e-large | 59.33 | 69.27 | 64.3 |
| OpenAI(text-embedding-ada-002) | 63.08 | 76.03 | 69.56 |
| multilingual-e5-small | 60.77 | 72.12 | 66.45 |
| multilingual-e5-base | 59.63 | 74.51 | 67.07 |
| multilingual-e5-large | 78.18 | 78.18 | 69.89 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 65.25 | 75.46 | 70.35 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 73.32 | 81.69 | 77.5 |
| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 71.37 | 82.17 | 76.77 |
| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 75.75 | 82.12 | 78.94 |
### 2.4. Classification
| Model | TNews | IFlyTek | MultilingualSentiment | JDReview | OnlineShopping | Waimai | AmazonReviewsClassification (zh) | MassiveIntentClassification (zh-CN) | MassiveScenarioClassification (zh-CN) | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
| luotuo-bert-medium | 45.22 | 41.75 | 61.21 | 79.68 | 84.3 | 79.57 | 34.46 | 57.47 | 65.32 | 61 |
| text2vec-large-chinese | 38.92 | 41.54 | 58.97 | 81.56 | 83.51 | 76.01 | 33.77 | 63.23 | 68.45 | 60.66 |
| text2vec-base-chinese | 43.02 | 42.05 | 60.98 | 82.14 | 85.69 | 77.22 | 34.12 | 63.98 | 70.52 | 62.19 |
| m3e-base | 48.28 | 44.42 | 71.9 | 85.33 | 87.77 | 83.99 | 43.02 | 68.4 | 74.6 | 67.52 |
| m3e-large | 48.26 | 43.96 | 72.47 | 86.92 | 89.59 | 86.1 | 44.44 | 67.23 | 74.88 | 68.2 |
| OpenAI(text-embedding-ada-002) | 45.77 | 44.62 | 67.99 | 74.6 | 88.94 | 82.37 | 38.3 | 64.81 | 71.4 | 64.31 |
| multilingual-e5-small | 48.38 | 47.35 | 64.74 | 79.34 | 88.73 | 83.9 | 37.5 | 68.24 | 74.47 | 65.85 |
| multilingual-e5-base | 47.06 | 44.93 | 65.28 | 76.21 | 88.4 | 84.42 | 37.23 | 69.16 | 75.42 | 65.35 |
| multilingual-e5-large | 48.38 | 45.47 | 68.58 | 80.99 | 90.81 | 85.02 | 38.83 | 71.12 | 76.83 | 67.34 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 47.67 | 42.07 | 65.07 | 80.64 | 87.4 | 83.8 | 37.31 | 61.44 | 67.39 | 63.64 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 49.97 | 44.54 | 70.63 | 83.92 | 91.38 | 85.46 | 40.68 | 65.72 | 71.3 | 67.07 |
| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 52.05 | 45.32 | 73.7 | 85.38 | 91.66 | 86.83 | 41.94 | 66.96 | 73.39 | 68.58 |
| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 50.84 | 45.09 | 74.41 | 85.08 | 91.6 | 86.54 | 42.39 | 67.18 | 71.76 | 68.32 |
### 2.5. Reranking
| Model | T2Reranking | MmarcoReranking | CMedQAv1 | CMedQAv2 | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|
| luotuo-bert-medium | 65.76 | 14.55 | 57.82 | 58.88 | 49.25 |
| text2vec-large-chinese | 64.82 | 12.48 | 58.92 | 60.41 | 49.16 |
| text2vec-base-chinese | 65.95 | 12.76 | 59.26 | 59.82 | 49.45 |
| m3e-base | 66.03 | 17.51 | 77.05 | 76.76 | 59.34 |
| m3e-large | 66.13 | 16.46 | 77.76 | 78.27 | 59.66 |
| OpenAI(text-embedding-ada-002) | 66.65 | 23.39 | 63.08 | 64.02 | 54.28 |
| multilingual-e5-small | 65.24 | 24.33 | 63.44 | 62.41 | 53.86 |
| multilingual-e5-base | 64.39 | 21.76 | 65.21 | 66.06 | 54.35 |
| multilingual-e5-large | 65.83 | 21.34 | 68.25 | 68.56 | 56.00 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 66.2 | 22.82 | 77.08 | 79.82 | 61.48 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 66.49 | 28.24 | 80.12 | 84.78 | 64.91 |
| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 66.16 | 27.1 | 81.72 | 84.64 | 64.91 |
| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 66.19 | 26.23 | 83.01 | 85.01 | 65.11 |
### 2.6. Clustering
| Model | CLSClusteringS2S | CLSClusteringP2P | ThuNewsClusteringS2S | ThuNewsClusteringP2P | Avg |
|:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|
| luotuo-bert-medium | 33.46 | 37.01 | 48.26 | 58.83 | 44.39 |
| text2vec-large-chinese | 28.77 | 30.13 | 26.14 | 35.05 | 30.02 |
| text2vec-base-chinese | 32.42 | 35.27 | 40.01 | 42.92 | 37.66 |
| m3e-base | 37.34 | 39.81 | 53.78 | 59.77 | 47.68 |
| m3e-large | 38.02 | 38.6 | 58.51 | 60.39 | 48.88 |
| OpenAI(text-embedding-ada-002) | 35.91 | 38.26 | 49.86 | 58.71 | 45.68 |
| multilingual-e5-small | 37.79 | 39.14 | 48.93 | 55.18 | 45.26 |
| multilingual-e5-base | 36.99 | 32.41 | 52.36 | 40.98 | 40.68 |
| multilingual-e5-large | 38.59 | 40.68 | 55.59 | 58.05 | 48.23 |
| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | 34.34 | 38.23 | 51.84 | 55.95 | 45.09 |
| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | 36.59 | 38.79 | 56.16 | 59.0 | 47.63 |
| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 40.04 | 41.23 | 56.75 | 62.03 | 50.01 |
| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 38.05 | 40.92 | 58.79 | 55.79 | 48.39 |
## Tasks
An overview of tasks and datasets available in MTEB-chinese is provided in the following table:
| Name | Hub URL | Description | Type | Category | Test #Samples |
|-----|-----|---------------------------|-----|-----|-----|
| [T2Retrieval](https://arxiv.org/abs/2304.03679) | [C-MTEB/T2Retrieval](https://huggingface.co/datasets/C-MTEB/T2Retrieval) | T2Ranking: A large-scale Chinese Benchmark for Passage Ranking | Retrieval | s2p | 24,832 |
| [MMarcoRetrieval](https://github.com/unicamp-dl/mMARCO) | [C-MTEB/MMarcoRetrieval](https://huggingface.co/datasets/C-MTEB/MMarcoRetrieval) | mMARCO is a multilingual version of the MS MARCO passage ranking dataset | Retrieval | s2p | 7,437 |
| [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | [C-MTEB/DuRetrieval](https://huggingface.co/datasets/C-MTEB/DuRetrieval) | A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine | Retrieval | s2p | 4,000 |
| [CovidRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | [C-MTEB/CovidRetrieval](https://huggingface.co/datasets/C-MTEB/CovidRetrieval) | COVID-19 news articles | Retrieval | s2p | 949 |
| [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | [C-MTEB/CmedqaRetrieval](https://huggingface.co/datasets/C-MTEB/CmedqaRetrieval) | Online medical consultation text | Retrieval | s2p | 3,999 |
| [EcomRetrieval](https://arxiv.org/abs/2203.03367) | [C-MTEB/EcomRetrieval](https://huggingface.co/datasets/C-MTEB/EcomRetrieval) | Passage retrieval dataset collected from Alibaba search engine systems in e-commerce domain | Retrieval | s2p | 1,000 |
| [MedicalRetrieval](https://arxiv.org/abs/2203.03367) | [C-MTEB/MedicalRetrieval](https://huggingface.co/datasets/C-MTEB/MedicalRetrieval) | Passage retrieval dataset collected from Alibaba search engine systems in medical domain | Retrieval | s2p | 1,000 |
| [VideoRetrieval](https://arxiv.org/abs/2203.03367) | [C-MTEB/VideoRetrieval](https://huggingface.co/datasets/C-MTEB/VideoRetrieval) | Passage retrieval dataset collected from Alibaba search engine systems in video domain | Retrieval | s2p | 1,000 |
| [T2Reranking](https://arxiv.org/abs/2304.03679) | [C-MTEB/T2Reranking](https://huggingface.co/datasets/C-MTEB/T2Reranking) | T2Ranking: A large-scale Chinese Benchmark for Passage Ranking | Reranking | s2p | 24,382 |
| [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) | [C-MTEB/MMarco-reranking](https://huggingface.co/datasets/C-MTEB/Mmarco-reranking) | mMARCO is a multilingual version of the MS MARCO passage ranking dataset | Reranking | s2p | 7,437 |
| [CMedQAv1](https://github.com/zhangsheng93/cMedQA) | [C-MTEB/CMedQAv1-reranking](https://huggingface.co/datasets/C-MTEB/CMedQAv1-reranking) | Chinese community medical question answering | Reranking | s2p | 2,000 |
| [CMedQAv2](https://github.com/zhangsheng93/cMedQA2) | [C-MTEB/CMedQAv2-reranking](https://huggingface.co/datasets/C-MTEB/CMedQAv2-reranking) | Chinese community medical question answering | Reranking | s2p | 4,000 |
| [Ocnli](https://arxiv.org/abs/2010.05444) | [C-MTEB/OCNLI](https://huggingface.co/datasets/C-MTEB/OCNLI) | Original Chinese Natural Language Inference dataset | PairClassification | s2s | 3,000 |
| [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | [C-MTEB/CMNLI](https://huggingface.co/datasets/C-MTEB/CMNLI) | Chinese Multi-Genre NLI | PairClassification | s2s | 139,000 |
| [CLSClusteringS2S](https://arxiv.org/abs/2209.05034) | [C-MTEB/CLSClusteringS2S](https://huggingface.co/datasets/C-MTEB/CLSClusteringS2S) | Clustering of titles from CLS dataset. Clustering of 13 sets, based on the main category. | Clustering | s2s | 10,000 |
| [CLSClusteringP2P](https://arxiv.org/abs/2209.05034) | [C-MTEB/CLSClusteringP2P](https://huggingface.co/datasets/C-MTEB/CLSClusteringP2P) | Clustering of titles + abstract from CLS dataset. Clustering of 13 sets, based on the main category. | Clustering | p2p | 10,000 |
| [ThuNewsClusteringS2S](http://thuctc.thunlp.org/) | [C-MTEB/ThuNewsClusteringS2S](https://huggingface.co/datasets/C-MTEB/ThuNewsClusteringS2S) | Clustering of titles from the THUCNews dataset | Clustering | s2s | 10,000 |
| [ThuNewsClusteringP2P](http://thuctc.thunlp.org/) | [C-MTEB/ThuNewsClusteringP2P](https://huggingface.co/datasets/C-MTEB/ThuNewsClusteringP2P) | Clustering of titles + abstract from the THUCNews dataset | Clustering | p2p | 10,000 |
| [ATEC](https://github.com/IceFlameWorm/NLP_Datasets/tree/master/ATEC) | [C-MTEB/ATEC](https://huggingface.co/datasets/C-MTEB/ATEC) | ATEC NLP sentence pair similarity competition | STS | s2s | 20,000 |
| [BQ](https://huggingface.co/datasets/shibing624/nli_zh) | [C-MTEB/BQ](https://huggingface.co/datasets/C-MTEB/BQ) | Bank Question Semantic Similarity | STS | s2s | 10,000 |
| [LCQMC](https://huggingface.co/datasets/shibing624/nli_zh) | [C-MTEB/LCQMC](https://huggingface.co/datasets/C-MTEB/LCQMC) | A large-scale Chinese question matching corpus. | STS | s2s | 12,500 |
| [PAWSX](https://arxiv.org/pdf/1908.11828.pdf) | [C-MTEB/PAWSX](https://huggingface.co/datasets/C-MTEB/PAWSX) | Translated PAWS evaluation pairs | STS | s2s | 2,000 |
| [STSB](https://github.com/pluto-junzeng/CNSD) | [C-MTEB/STSB](https://huggingface.co/datasets/C-MTEB/STSB) | Translate STS-B into Chinese | STS | s2s | 1,360 |
| [AFQMC](https://github.com/CLUEbenchmark/CLUE) | [C-MTEB/AFQMC](https://huggingface.co/datasets/C-MTEB/AFQMC) | Ant Financial Question Matching Corpus| STS | s2s | 3,861 |
| [QBQTC](https://github.com/CLUEbenchmark/QBQTC) | [C-MTEB/QBQTC](https://huggingface.co/datasets/C-MTEB/QBQTC) | QQ Browser Query Title Corpus | STS | s2s | 5,000 |
| [TNews](https://github.com/CLUEbenchmark/CLUE) | [C-MTEB/TNews-classification](https://huggingface.co/datasets/C-MTEB/TNews-classification) | Short Text Classification for News | Classification | s2s | 10,000 |
| [IFlyTek](https://github.com/CLUEbenchmark/CLUE) | [C-MTEB/IFlyTek-classification](https://huggingface.co/datasets/C-MTEB/IFlyTek-classification) | Long Text classification for the description of Apps | Classification | s2s | 2,600 |
| [Waimai](https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/waimai_10k/intro.ipynb) | [C-MTEB/waimai-classification](https://huggingface.co/datasets/C-MTEB/waimai-classification) | Sentiment Analysis of user reviews on takeaway platforms | Classification | s2s | 1,000 |
| [OnlineShopping](https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/online_shopping_10_cats/intro.ipynb) | [C-MTEB/OnlineShopping-classification](https://huggingface.co/datasets/C-MTEB/OnlineShopping-classification) | Sentiment Analysis of User Reviews on Online Shopping Websites | Classification | s2s | 1,000 |
| [MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | [C-MTEB/MultilingualSentiment-classification](https://huggingface.co/datasets/C-MTEB/MultilingualSentiment-classification) | A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative | Classification | s2s | 3,000 |
| [JDReview](https://huggingface.co/datasets/kuroneko5943/jd21) | [C-MTEB/JDReview-classification](https://huggingface.co/datasets/C-MTEB/JDReview-classification) | review for iphone | Classification | s2s | 533 |
For retrieval tasks, we sample 100,000 candidates (including the ground truths) from the entire corpus to reduce the inference cost.
## Acknowledgement
We thank the great tool from [Massive Text Embedding Benchmark](https://github.com/embeddings-benchmark/mteb) and the open-source datasets from Chinese NLP community.
## Citation
If you find this repository useful, please consider citation
```
@misc{c-pack,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
year={2023},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
import argparse
from C_MTEB.tasks import *
from flag_dres_model import FlagDRESModel
from mteb import MTEB
# Per-model query instruction prepended to retrieval queries; None disables it.
query_instruction_for_retrieval_dict = {
    "BAAI/bge-large-zh": "为这个句子生成表示以用于检索相关文章:",
    "BAAI/bge-large-zh-noinstruct": None,  # this checkpoint was trained without an instruction
    "BAAI/bge-base-zh": "为这个句子生成表示以用于检索相关文章:",
    "BAAI/bge-small-zh": "为这个句子生成表示以用于检索相关文章:",
    "BAAI/bge-large-zh-v1.5": "为这个句子生成表示以用于检索相关文章:",
    "BAAI/bge-base-zh-v1.5": "为这个句子生成表示以用于检索相关文章:",
    # Fixed key typo: was "BAAI/bge-small-zh-v.15", which never matched the
    # real model id, so bge-small-zh-v1.5 silently fell through to the fallback.
    "BAAI/bge-small-zh-v1.5": "为这个句子生成表示以用于检索相关文章:",
}
def get_args():
    """Build and parse the command-line arguments for this evaluation script."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--model_name_or_path', type=str, default="BAAI/bge-large-zh")
    arg_parser.add_argument('--task_type', type=str, default=None)
    arg_parser.add_argument('--add_instruction', action='store_true',
                            help="whether to add instruction for query")
    arg_parser.add_argument('--pooling_method', type=str, default='cls')
    return arg_parser.parse_args()
# Entry point: evaluate the given embedding model on all Chinese (zh / zh-CN)
# C-MTEB tasks and write per-task JSON results under zh_results/<model_name>.
if __name__ == '__main__':
    args = get_args()
    # The instruction passed here is only an initial value; it is overwritten
    # for every task below via model.query_instruction_for_retrieval.
    model = FlagDRESModel(model_name_or_path=args.model_name_or_path,
                          query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                          pooling_method=args.pooling_method)
    task_names = [t.description["name"] for t in MTEB(task_types=args.task_type,
                                                      task_langs=['zh', 'zh-CN']).tasks]
    for task in task_names:
        # if task not in ChineseTaskList:
        #     continue
        # Only retrieval / reranking tasks get a query-side instruction; for
        # every other task type the instruction is disabled.
        if task in ['T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval',
                    'CovidRetrieval', 'CmedqaRetrieval',
                    'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
                    'T2Reranking', 'MMarcoReranking', 'CMedQAv1', 'CMedQAv2']:
            if args.model_name_or_path not in query_instruction_for_retrieval_dict:
                # Unknown model: fall back to the generic bge instruction only
                # when --add_instruction was passed on the command line.
                if args.add_instruction:
                    instruction = "为这个句子生成表示以用于检索相关文章:"
                else:
                    instruction = None
                print(f"{args.model_name_or_path} not in query_instruction_for_retrieval_dict, set instruction={instruction}")
            else:
                instruction = query_instruction_for_retrieval_dict[args.model_name_or_path]
        else:
            instruction = None
        model.query_instruction_for_retrieval = instruction
        evaluation = MTEB(tasks=[task], task_langs=['zh', 'zh-CN'])
        evaluation.run(model, output_folder=f"zh_results/{args.model_name_or_path.split('/')[-1]}")
import argparse
from flag_dres_model import FlagDRESModel
from mteb import MTEB
# Query-side instruction for the English bge checkpoints. Every listed model
# shares the same prompt, which is prepended to retrieval queries only.
_BGE_EN_INSTRUCTION = "Represent this sentence for searching relevant passages: "
query_instruction_for_retrieval_dict = {
    model_id: _BGE_EN_INSTRUCTION
    for model_id in (
        "BAAI/bge-large-en",
        "BAAI/bge-base-en",
        "BAAI/bge-small-en",
        "BAAI/bge-large-en-v1.5",
        "BAAI/bge-base-en-v1.5",
        "BAAI/bge-small-en-v1.5",
    )
}
def get_args():
    """Parse this script's command-line flags and return the namespace."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--model_name_or_path', type=str, default="BAAI/bge-large-en")
    cli.add_argument('--task_type', type=str, default=None,
                     help="task type. Default is None, which means using all task types")
    cli.add_argument('--add_instruction', action='store_true',
                     help="whether to add instruction for query")
    cli.add_argument('--pooling_method', type=str, default='cls')
    return cli.parse_args()
# Entry point: evaluate the given embedding model on all English MTEB tasks
# and write per-task JSON results under en_results/<model_name>.
if __name__ == '__main__':
    args = get_args()
    model = FlagDRESModel(model_name_or_path=args.model_name_or_path,
                          normalize_embeddings=False,  # normlize embedding will harm the performance of classification task
                          query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                          pooling_method=args.pooling_method)
    task_names = [t.description["name"] for t in MTEB(task_types=args.task_type,
                                                      task_langs=['en']).tasks]
    for task in task_names:
        # MSMARCOv2 ships without a test split, so it cannot be scored.
        if task in ['MSMARCOv2']:
            print('Skip task: {}, since it has no test split'.format(task))
            continue
        # Only retrieval tasks (all CQADupstack variants plus this fixed list)
        # get a query-side instruction; other task types run without one.
        if 'CQADupstack' in task or task in ['Touche2020', 'SciFact', 'TRECCOVID', 'NQ',
                                             'NFCorpus', 'MSMARCO', 'HotpotQA', 'FiQA2018',
                                             'FEVER', 'DBPedia', 'ClimateFEVER', 'SCIDOCS', ]:
            if args.model_name_or_path not in query_instruction_for_retrieval_dict:
                # Unknown model: fall back to the generic bge instruction only
                # when --add_instruction was passed on the command line.
                if args.add_instruction:
                    instruction = "Represent this sentence for searching relevant passages: "
                else:
                    instruction = None
                print(f"{args.model_name_or_path} not in query_instruction_for_retrieval_dict, set instruction={instruction}")
            else:
                instruction = query_instruction_for_retrieval_dict[args.model_name_or_path]
        else:
            instruction = None
        model.query_instruction_for_retrieval = instruction
        # MSMARCO is conventionally evaluated on its dev split.
        evaluation = MTEB(tasks=[task], task_langs=['en'], eval_splits = ["test" if task not in ['MSMARCO'] else 'dev'])
        evaluation.run(model, output_folder=f"en_results/{args.model_name_or_path.split('/')[-1]}")
import argparse
from C_MTEB.tasks import *
from mteb import MTEB
from FlagEmbedding import FlagReranker
def get_args():
    """Parse command-line flags for the reranker evaluation."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--model_name_or_path', type=str, default="BAAI/bge-reranker-base")
    return cli.parse_args()
# Entry point: evaluate a cross-encoder reranker on the Chinese reranking
# tasks (including the zh2en / en2zh cross-language variants) and store the
# JSON results under reranker_results/<save_name>.
if __name__ == '__main__':
    args = get_args()
    # use_fp16 halves GPU memory/latency; scores are typically unaffected.
    model = FlagReranker(args.model_name_or_path, use_fp16=True)
    # For intermediate checkpoints keep the parent directory in the save name
    # so different checkpoints of one training run do not overwrite each other.
    if 'checkpoint-' in args.model_name_or_path:
        save_name = "_".join(args.model_name_or_path.split('/')[-2:])
    else:
        save_name = "_".join(args.model_name_or_path.split('/')[-1:])
    evaluation = MTEB(task_types=["Reranking"], task_langs=['zh', 'zh2en', 'en2zh'])
    evaluation.run(model, output_folder=f"reranker_results/{save_name}")
from typing import cast, List, Dict, Union
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, is_torch_npu_available
class FlagDRESModel:
    """Dense-retrieval wrapper around a HuggingFace encoder for (C-)MTEB.

    Implements the DRES-style interface MTEB retrieval tasks expect:
    ``encode_queries`` / ``encode_corpus`` plus a generic ``encode``.
    """

    def __init__(
            self,
            model_name_or_path: str = None,
            pooling_method: str = 'cls',
            normalize_embeddings: bool = True,
            query_instruction_for_retrieval: str = None,
            batch_size: int = 256,
    ) -> None:
        """Load tokenizer and model and pick the best available device.

        Args:
            model_name_or_path: HuggingFace model id or local path.
            pooling_method: 'cls' (first token) or 'mean' (mask-weighted mean).
            normalize_embeddings: L2-normalize embeddings if True.
            query_instruction_for_retrieval: optional text prepended to queries.
            batch_size: per-step batch size (scaled up by the GPU count below).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.query_instruction_for_retrieval = query_instruction_for_retrieval
        self.normalize_embeddings = normalize_embeddings
        self.pooling_method = pooling_method
        self.batch_size = batch_size

        # Prefer CUDA, then Ascend NPU, then CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif is_torch_npu_available():
            self.device = torch.device("npu")
        else:
            self.device = torch.device("cpu")
        self.model = self.model.to(self.device)

        # With multiple GPUs, split each batch across devices and scale the
        # effective batch size accordingly.
        num_gpus = torch.cuda.device_count()
        if num_gpus > 1:
            self.model = torch.nn.DataParallel(self.model)
            self.batch_size = self.batch_size * num_gpus

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        '''
        This function will be used for retrieval task
        if there is a instruction for queries, we will add it to the query text
        '''
        if self.query_instruction_for_retrieval is not None:
            input_texts = ['{}{}'.format(self.query_instruction_for_retrieval, q) for q in queries]
        else:
            input_texts = queries
        return self.encode(input_texts)

    def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kwargs) -> np.ndarray:
        '''
        This function will be used for retrieval task
        encode corpus for retrieval task
        '''
        # MTEB may pass documents as {'title': ..., 'text': ...} dicts; fold
        # the (possibly missing) title into the text in that case.
        if isinstance(corpus[0], dict):
            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
        else:
            input_texts = corpus
        return self.encode(input_texts)

    @torch.no_grad()
    def encode(self, sentences: List[str], **kwargs) -> np.ndarray:
        """Embed ``sentences`` batch by batch and return a (N, dim) array."""
        self.model.eval()

        all_embeddings = []
        # Hide the progress bar for small inputs (less than one full batch at
        # the default size).
        for start_index in tqdm(range(0, len(sentences), self.batch_size), desc="Batches", disable=len(sentences)<256):
            sentences_batch = sentences[start_index:start_index + self.batch_size]
            inputs = self.tokenizer(
                sentences_batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512,
            ).to(self.device)
            last_hidden_state = self.model(**inputs, return_dict=True).last_hidden_state
            embeddings = self.pooling(last_hidden_state, inputs['attention_mask'])
            if self.normalize_embeddings:
                embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
            embeddings = cast(torch.Tensor, embeddings)
            all_embeddings.append(embeddings.cpu().numpy())

        return np.concatenate(all_embeddings, axis=0)

    def pooling(self,
                last_hidden_state: torch.Tensor,
                attention_mask: torch.Tensor = None) -> torch.Tensor:
        """Reduce per-token states to one embedding per sentence.

        'cls' takes the first token's state; 'mean' averages token states
        weighted by the attention mask (padding excluded).

        Raises:
            ValueError: for an unsupported ``pooling_method``. The original
            code silently returned None here, which only surfaced later as a
            confusing failure inside normalize/cpu().
        """
        if self.pooling_method == 'cls':
            return last_hidden_state[:, 0]
        if self.pooling_method == 'mean':
            s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)
            d = attention_mask.sum(dim=1, keepdim=True).float()
            return s / d
        raise ValueError(f"Unsupported pooling method: {self.pooling_method!r}")
from setuptools import setup, find_packages
# Read the long description for PyPI from the package README.
with open("README.md", mode="r", encoding="utf-8") as readme_file:
    readme = readme_file.read()

# Package metadata for C_MTEB. mteb is pinned to 1.1.1 so task definitions
# and leaderboard numbers stay reproducible.
setup(
    name='C_MTEB',
    version='1.1.1',
    description='Chinese Massive Text Embedding Benchmark',
    long_description=readme,
    long_description_content_type="text/markdown",
    author_email='2906698981@qq.com',
    url='https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB',
    packages=find_packages(),
    install_requires=[
        'mteb[beir]==1.1.1',
    ],
)
import argparse
import json
import os
from collections import defaultdict
from C_MTEB import *
from mteb import MTEB
def read_results(task_types, except_tasks, args):
    """Collect per-model scores for every task of the given task types.

    Scans ``args.results_dir`` for one sub-directory per model and reads the
    ``<task_name>.json`` files that MTEB wrote there.

    Args:
        task_types: list of MTEB task types (e.g. ["Retrieval", "STS"]).
        except_tasks: task names to skip entirely.
        args: parsed CLI namespace; uses ``args.results_dir`` and ``args.lang``
            (a list of language codes at this point, e.g. ['zh', 'zh-CN']).

    Returns:
        A tuple ``(tasks_results, model_dirs)`` where ``tasks_results`` maps
        task_type -> task_name -> model_name -> score (percent, 2 decimals)
        and ``model_dirs`` maps model_name -> its results directory.
    """
    tasks_results = {}
    model_dirs = {}
    for t_type in task_types:
        tasks_results[t_type] = {}
        for t in MTEB(task_types=[t_type], task_langs=args.lang).tasks:
            task_name = t.description["name"]
            if task_name in except_tasks: continue

            metric = t.description["main_score"]
            tasks_results[t_type][task_name] = defaultdict(None)
            for model_name in os.listdir(args.results_dir):
                model_dir = os.path.join(args.results_dir, model_name)
                if not os.path.isdir(model_dir): continue
                model_dirs[model_name] = model_dir

                result_path = os.path.join(model_dir, task_name + '.json')
                if not os.path.exists(result_path):
                    continue
                # Use a context manager so the file handle is closed
                # deterministically (the original leaked it via
                # json.load(open(...))).
                with open(result_path) as f:
                    data = json.load(f)

                # Pick the first evaluation split present in the file. The
                # original left `split` unbound (or stale from a previous
                # file) when none matched; skip such files instead.
                split = next((s for s in ('test', 'dev', 'validation') if s in data), None)
                if split is None:
                    continue

                # Scores may be nested one level deeper per language (pair).
                if 'en' in args.lang:
                    if 'en-en' in data[split]:
                        temp_data = data[split]['en-en']
                    elif 'en' in data[split]:
                        temp_data = data[split]['en']
                    else:
                        temp_data = data[split]
                elif 'zh' in args.lang:
                    if 'zh' in data[split]:
                        temp_data = data[split]['zh']
                    elif 'zh-CN' in data[split]:
                        temp_data = data[split]['zh-CN']
                    else:
                        temp_data = data[split]

                # 'ap' and 'cosine_spearman' live under the 'cos_sim' group.
                if metric == 'ap':
                    tasks_results[t_type][task_name][model_name] = round(temp_data['cos_sim']['ap'] * 100, 2)
                elif metric == 'cosine_spearman':
                    tasks_results[t_type][task_name][model_name] = round(temp_data['cos_sim']['spearman'] * 100, 2)
                else:
                    tasks_results[t_type][task_name][model_name] = round(temp_data[metric] * 100, 2)
    return tasks_results, model_dirs
def output_markdown(tasks_results, model_names, save_file):
    """Render the collected scores as markdown tables and write them out.

    Emits one table per task type followed by an "Overall" table. All
    CQADupstack* sub-tasks are collapsed into a single averaged column, and a
    model only gets an Avg cell when it has a score for every column.
    """
    per_type_scores = {}
    with open(save_file, 'w') as out:
        for t_type, type_results in tasks_results.items():
            per_type_scores[t_type] = defaultdict()
            # Column layout: every non-CQADupstack task in insertion order,
            # then one merged "CQADupstack" column if any sub-task exists.
            plain_tasks = [name for name in type_results if "CQADupstack" not in name]
            merge_cqa = len(plain_tasks) != len(type_results)
            columns = plain_tasks + (["CQADupstack"] if merge_cqa else [])

            out.write(f'Task Type: {t_type} \n')
            header = "| Model |" + "".join(f" {c} |" for c in columns)
            divider = "|:-------------------------------|" + ":--------:|" * len(columns)
            out.write(header + ' Avg | \n')
            out.write(divider + ':--------:| \n')

            for model in model_names:
                row = f"| {model} |"
                scores = []
                cqa_scores = []
                for task_name, results in type_results.items():
                    if "CQADupstack" in task_name:
                        if model in results:
                            cqa_scores.append(results[model])
                        continue
                    if model in results:
                        row += f" {results[model]} |"
                        scores.append(results[model])
                    else:
                        row += " |"
                if cqa_scores:
                    cqa_avg = round(sum(cqa_scores) / len(cqa_scores), 2)
                    row += f" {cqa_avg} |"
                    scores.append(cqa_avg)
                # Average only when the model covered every column.
                if len(scores) == len(columns):
                    row += f" {round(sum(scores) / len(scores), 2)} |"
                    per_type_scores[t_type][model] = scores
                else:
                    row += " |"
                out.write(row + ' \n')

        # Overall table: one column per task type, averaging each model's
        # per-type score lists.
        out.write('Overall \n')
        header = "| Model |" + "".join(f" {t} |" for t in per_type_scores)
        divider = "|:-------------------------------|" + ":--------:|" * len(per_type_scores)
        out.write(header + ' Avg | \n')
        out.write(divider + ':--------:| \n')
        for model in model_names:
            row = f"| {model} |"
            combined = []
            for results in per_type_scores.values():
                if model in results:
                    row += f" {round(sum(results[model]) / len(results[model]), 2)} |"
                    combined.extend(results[model])
                else:
                    row += " |"
            if combined:
                row += f" {round(sum(combined) / len(combined), 2)} |"
            out.write(row + ' \n')
def get_args():
    """Parse command-line options: the results directory and the language key."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--results_dir', type=str, default="./zh_results")
    arg_parser.add_argument('--lang', type=str, default="zh")
    return arg_parser.parse_args()
if __name__ == '__main__':
    args = get_args()
    # Each benchmark language has its own task-type set, excluded tasks, and
    # the list of language tags used to filter per-task results.
    if args.lang == 'zh':
        task_types = ["Retrieval", "STS", "PairClassification", "Classification", "Reranking", "Clustering"]
        except_tasks = []
        # 'zh-CN' additionally matches tasks tagged with the region-specific code.
        args.lang = ['zh', 'zh-CN']
    elif args.lang == 'en':
        task_types = ["Retrieval", "Clustering", "PairClassification", "Reranking", "STS", "Summarization",
                      "Classification"]
        except_tasks = ['MSMARCOv2']
        args.lang = ['en']
    else:
        # BUGFIX: original message read "...but<lang>" with no space/verb.
        raise NotImplementedError(f"args.lang must be zh or en, but got {args.lang}")
    task_results, model_dirs = read_results(task_types, except_tasks, args=args)
    output_markdown(task_results, model_dirs.keys(),
                    save_file=os.path.join(args.results_dir, f'{args.lang[0]}_results.md'))
# BGE-M3 ([paper](https://arxiv.org/pdf/2402.03216.pdf), [code](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3))
In this project, we introduce BGE-M3, which is distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.
- Multi-Functionality: It can simultaneously perform the three common retrieval functionalities of embedding model: dense retrieval, multi-vector retrieval, and sparse retrieval.
- Multi-Linguality: It can support more than 100 working languages.
- Multi-Granularity: It is able to process inputs of different granularities, spanning from short sentences to long documents of up to 8192 tokens.
For more details, please refer to our paper: [BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation](https://arxiv.org/pdf/2402.03216.pdf)
**Some suggestions for retrieval pipeline in RAG**
We recommend using the following pipeline: hybrid retrieval + re-ranking.
- Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities.
A classic example: using both embedding retrieval and the BM25 algorithm.
Now, you can try to use BGE-M3, which supports both embedding and sparse retrieval.
This allows you to obtain token weights (similar to BM25) without any additional cost when generating dense embeddings.
To use hybrid retrieval, you can refer to [Vespa](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb
) and [Milvus](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
- As cross-encoder models, re-ranker demonstrates higher accuracy than bi-encoder embedding model.
Utilizing the re-ranking model (e.g., [bge-reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker), [bge-reranker-v2](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/llm_reranker)) after retrieval can further filter the selected text.
## News:
- 2024/7/1: **We update the MIRACL evaluation results of BGE-M3**. To reproduce the new results, you can refer to: [bge-m3_miracl_2cr](https://huggingface.co/datasets/hanhainebula/bge-m3_miracl_2cr). We have also updated our [paper](https://arxiv.org/pdf/2402.03216) on arXiv.
<details>
<summary> Details </summary>
> The previous test results were lower because we mistakenly removed the passages that have the same id as the query from the search results. After correcting this mistake, the overall performance of BGE-M3 on MIRACL is higher than the previous results, but the experimental conclusion remains unchanged. The other results are not affected by this mistake. To reproduce the previous lower results, you need to add the `--remove-query` parameter when using `pyserini.search.faiss` or `pyserini.search.lucene` to search the passages.
</details>
- 2024/3/20: **Thanks Milvus team!** Now you can use hybrid retrieval of bge-m3 in Milvus: [pymilvus/examples
/hello_hybrid_sparse_dense.py](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
- 2024/3/8: **Thanks for the [experimental results](https://towardsdatascience.com/openai-vs-open-source-multilingual-embedding-models-e5ccb7c90f05) from @[Yannael](https://huggingface.co/Yannael). In this benchmark, BGE-M3 achieves top performance in both English and other languages, surpassing models such as OpenAI.**
- 2024/3/2: Release unified fine-tuning [example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/unified_finetune) and [data](https://huggingface.co/datasets/Shitao/bge-m3-data)
- 2024/2/6: We release the [MLDR](https://huggingface.co/datasets/Shitao/MLDR) (a long document retrieval dataset covering 13 languages) and [evaluation pipeline](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR).
- 2024/2/1: **Thanks for the excellent tool from Vespa.** You can easily use multiple modes of BGE-M3 following this [notebook](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb)
## Specs
- Model
| Model Name | Dimension | Sequence Length | Introduction |
|:----:|:---:|:---:|:---:|
| [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | 1024 | 8192 | multilingual; unified fine-tuning (dense, sparse, and colbert) from bge-m3-unsupervised|
| [BAAI/bge-m3-unsupervised](https://huggingface.co/BAAI/bge-m3-unsupervised) | 1024 | 8192 | multilingual; contrastive learning from bge-m3-retromae |
| [BAAI/bge-m3-retromae](https://huggingface.co/BAAI/bge-m3-retromae) | -- | 8192 | multilingual; extend the max_length of [xlm-roberta](https://huggingface.co/FacebookAI/xlm-roberta-large) to 8192 and further pretrained via [retromae](https://github.com/staoxiao/RetroMAE)|
| [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) | 1024 | 512 | English model |
| [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 768 | 512 | English model |
| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | 384 | 512 | English model |
- Data
| Dataset | Introduction |
|:----------------------------------------------------------:|:-------------------------------------------------:|
| [MLDR](https://huggingface.co/datasets/Shitao/MLDR) | Document Retrieval Dataset, covering 13 languages |
| [bge-m3-data](https://huggingface.co/datasets/Shitao/bge-m3-data) | Fine-tuning data used by bge-m3 |
## FAQ
**1. Introduction for different retrieval methods**
- Dense retrieval: map the text into a single embedding, e.g., [DPR](https://arxiv.org/abs/2004.04906), [BGE-v1.5](https://github.com/FlagOpen/FlagEmbedding)
- Sparse retrieval (lexical matching): a vector of size equal to the vocabulary, with the majority of positions set to zero, calculating a weight only for tokens present in the text. e.g., BM25, [unicoil](https://arxiv.org/pdf/2106.14807.pdf), and [splade](https://arxiv.org/abs/2107.05720)
- Multi-vector retrieval: use multiple vectors to represent a text, e.g., [ColBERT](https://arxiv.org/abs/2004.12832).
**2. How to use BGE-M3 in other projects?**
For embedding retrieval, you can employ the BGE-M3 model using the same approach as BGE.
The only difference is that the BGE-M3 model no longer requires adding instructions to the queries.
For hybrid retrieval, you can use [Vespa](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb
) and [Milvus](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
**3. How to fine-tune bge-M3 model?**
You can follow the steps in this [example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune)
to fine-tune the dense embedding.
If you want to fine-tune all embedding function of m3 (dense, sparse and colbert), you can refer to the [unified_fine-tuning example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/unified_finetune)
## Usage
Install:
```
git clone https://github.com/FlagOpen/FlagEmbedding.git
cd FlagEmbedding
pip install -e .
```
or:
```
pip install -U FlagEmbedding
```
### Generate Embedding for text
- Dense Embedding
```python
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3',
use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
embeddings_1 = model.encode(sentences_1,
batch_size=12,
max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
)['dense_vecs']
embeddings_2 = model.encode(sentences_2)['dense_vecs']
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
# [[0.6265, 0.3477], [0.3499, 0.678 ]]
```
You also can use sentence-transformers and huggingface transformers to generate dense embeddings.
Refer to [baai_general_embedding](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/baai_general_embedding#usage) for details.
- Sparse Embedding (Lexical Weight)
```python
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=False)
output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=False)
# you can see the weight for each token:
print(model.convert_id_to_token(output_1['lexical_weights']))
# [{'What': 0.08356, 'is': 0.0814, 'B': 0.1296, 'GE': 0.252, 'M': 0.1702, '3': 0.2695, '?': 0.04092},
# {'De': 0.05005, 'fin': 0.1368, 'ation': 0.04498, 'of': 0.0633, 'BM': 0.2515, '25': 0.3335}]
# compute the scores via lexical matching
lexical_scores = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])
print(lexical_scores)
# 0.19554901123046875
print(model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_1['lexical_weights'][1]))
# 0.0
```
- Multi-Vector (ColBERT)
```python
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)
output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)
print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]))
print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]))
# 0.7797
# 0.4620
```
### Compute score for text pairs
Input a list of text pairs, you can get the scores computed by different methods.
```python
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
sentence_pairs = [[i,j] for i in sentences_1 for j in sentences_2]
print(model.compute_score(sentence_pairs,
max_passage_length=128, # a smaller max length leads to a lower latency
weights_for_different_modes=[0.4, 0.2, 0.4])) # weights_for_different_modes(w) is used to do weighted sum: w[0]*dense_score + w[1]*sparse_score + w[2]*colbert_score
# {
# 'colbert': [0.7796499729156494, 0.4621465802192688, 0.4523794651031494, 0.7898575067520142],
# 'sparse': [0.195556640625, 0.00879669189453125, 0.0, 0.1802978515625],
# 'dense': [0.6259765625, 0.347412109375, 0.349853515625, 0.67822265625],
# 'sparse+dense': [0.482503205537796, 0.23454029858112335, 0.2332356721162796, 0.5122477412223816],
# 'colbert+sparse+dense': [0.6013619303703308, 0.3255828022956848, 0.32089319825172424, 0.6232916116714478]
# }
```
## Evaluation
We provide the evaluation script for [MKQA](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MKQA) and [MLDR](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR)
### Benchmarks from the open-source community
![avatar](./imgs/others.webp)
The BGE-M3 model emerged as the top performer on this benchmark (OAI is short for OpenAI).
For more details, please refer to the [article](https://towardsdatascience.com/openai-vs-open-source-multilingual-embedding-models-e5ccb7c90f05) and [Github Repo](https://github.com/Yannael/multilingual-embeddings)
### Our results
- Multilingual (MIRACL dataset)
![avatar](./imgs/miracl.jpg)
- Cross-lingual (MKQA dataset)
![avatar](./imgs/mkqa.jpg)
- Long Document Retrieval
- MLDR:
![avatar](./imgs/long.jpg)
Please note that [MLDR](https://huggingface.co/datasets/Shitao/MLDR) is a document retrieval dataset we constructed via LLM,
covering 13 languages, including test set, validation set, and training set.
We utilized the training set from MLDR to enhance the model's long document retrieval capabilities.
Therefore, comparing baselines with `Dense w.o.long`(fine-tuning without long document dataset) is more equitable.
Additionally, this long document retrieval dataset will be open-sourced to address the current lack of open-source multilingual long text retrieval datasets.
We believe that this data will be helpful for the open-source community in training document retrieval models.
- NarrativeQA:
![avatar](./imgs/nqa.jpg)
- Comparison with BM25
We utilized Pyserini to implement BM25, and the test results can be reproduced by this [script](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR#bm25-baseline).
We tested BM25 using two different tokenizers:
one using Lucene Analyzer and the other using the same tokenizer as M3 (i.e., the tokenizer of xlm-roberta).
The results indicate that BM25 remains a competitive baseline,
especially in long document retrieval.
![avatar](./imgs/bm25.jpg)
## Training
- Self-knowledge Distillation: combining multiple outputs from different
retrieval modes as a reward signal to enhance the performance of a single mode (especially for sparse retrieval and multi-vector (ColBERT) retrieval)
- Efficient Batching: Improve the efficiency when fine-tuning on long text.
The small-batch strategy is simple but effective, and can also be used to fine-tune large embedding models.
- MCLS: A simple method to improve the performance on long text without fine-tuning.
If you do not have enough resources to fine-tune the model with long text, this method is useful.
Refer to our [report](https://arxiv.org/pdf/2402.03216.pdf) for more details.
## Acknowledgement
Thanks to the authors of open-sourced datasets, including MIRACL, MKQA, NarrativeQA, etc.
Thanks to the open-sourced libraries like [Tevatron](https://github.com/texttron/tevatron), [Pyserini](https://github.com/castorini/pyserini).
## Citation
If you find this repository useful, please consider giving a star :star: and citation
```
@misc{bge-m3,
title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
year={2024},
eprint={2402.03216},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
from .modeling import BGEM3Model, BGEM3ForInference, EncoderOutput
from .trainer import BiTrainer
\ No newline at end of file
import os
from dataclasses import dataclass, field
from typing import Optional
from transformers import TrainingArguments
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """
    # Required: Hugging Face hub id or local path of the base model.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Optional override when the config is stored under a different name/path.
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    # Optional override when the tokenizer is stored under a different name/path.
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    # Local directory for caching downloaded pretrained weights.
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
@dataclass
class DataArguments:
    """Arguments controlling how training data is located, loaded and batched."""

    knowledge_distillation: bool = field(
        default=False, metadata={"help": "Use knowledge distillation when `pos_scores` and `neg_scores` are in features of training data"}
    )
    # NOTE(review): parsed with nargs="+", so at runtime this holds a list of
    # paths even though the annotation says `str` (see `__post_init__`).
    train_data: str = field(
        default=None, metadata={"help": "One or more paths to training data", "nargs": "+"}
    )
    cache_path: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the cached data"}
    )
    # Passages per query: 1 positive plus (train_group_size - 1) negatives.
    train_group_size: int = field(default=8)
    query_max_len: int = field(
        default=32,
        metadata={
            # BUGFIX: help text previously said "for passage" (copy-paste from
            # passage_max_len); this limit applies to the query side.
            "help": "The maximum total input sequence length after tokenization for query. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    passage_max_len: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for passage. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    # May be None (no cap) despite the `int` annotation.
    max_example_num_per_dataset: int = field(
        default=None, metadata={"help": "the max number of examples for each dataset"}
    )
    query_instruction_for_retrieval: str = field(
        default=None, metadata={"help": "instruction for query"}
    )
    passage_instruction_for_retrieval: str = field(
        default=None, metadata={"help": "instruction for passage"}
    )
    same_task_within_batch: bool = field(
        default=False, metadata={"help": "All samples in the same batch comes from the same task."}
    )
    shuffle_ratio: float = field(
        default=0.0, metadata={"help": "The ratio of shuffling the text"}
    )
    small_threshold: int = field(
        default=0, metadata={"help": "The threshold of small dataset. All small dataset in the same directory will be merged into one dataset."}
    )
    drop_threshold: int = field(
        default=0, metadata={"help": "The threshold for dropping merged small dataset. If the number of examples in the merged small dataset is less than this threshold, it will be dropped."}
    )

    def __post_init__(self):
        # Fail fast on missing data paths instead of failing mid-training.
        for train_dir in self.train_data:
            if not os.path.exists(train_dir):
                raise FileNotFoundError(f"cannot find file: {train_dir}, please set a true path")
@dataclass
class RetrieverTrainingArguments(TrainingArguments):
    """TrainingArguments extended with retriever-specific options for (unified) fine-tuning."""

    negatives_cross_device: bool = field(default=False, metadata={"help": "share negatives across devices"})
    # Temperature applied to similarity scores in the contrastive loss.
    temperature: Optional[float] = field(default=0.02)
    fix_position_embedding: bool = field(default=False, metadata={"help": "Freeze the parameters of position embeddings"})
    sentence_pooling_method: str = field(default='cls', metadata={"help": "the pooling method, should be cls or mean"})
    # (sic) name kept as `normlized` for backward compatibility with existing launch scripts.
    normlized: bool = field(default=True)
    # BUGFIX: help text previously duplicated fix_position_embedding's
    # ("Freeze the parameters of position embeddings") by copy-paste.
    enable_sub_batch: bool = field(default=True, metadata={"help": "Whether to encode the batch in sub-batches"})
    unified_finetuning: bool = field(default=False, metadata={"help": "use unify fine-tuning"})
    use_self_distill: bool = field(default=False, metadata={"help": "use self-distill when using unify fine-tuning"})
    fix_encoder: bool = field(default=False, metadata={"help": "Freeze the parameters of encoder"})
    colbert_dim: int = field(default=-1, metadata={"help": "Dim of colbert linear"})
    self_distill_start_step: int = field(default=-1, metadata={"help": "Num of step when using self-distill"})
import math
import os.path
import random
from dataclasses import dataclass
import torch
import numpy as np
import datasets
from pprint import pprint
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding
import torch.distributed as dist
from .arguments import DataArguments
class SameDatasetTrainDataset(Dataset):
"""Dataset to yield a batch of data at one time. All samples in the same batch comes from the same task.
"""
def __init__(self, args: DataArguments, batch_size: int, seed: int, process_index: int=0, num_processes: int=1):
train_datasets = []
each_data_inxs = []
batch_size_inxs = []
pqloss_flag = []
cur_all_num = 0
SMALL_THRESHOLD = args.small_threshold
DROP_THRESHOLD = args.drop_threshold
context_feat = datasets.Features({
'query': datasets.Value('string'),
'pos': datasets.Sequence(datasets.Value('string')),
'neg': datasets.Sequence(datasets.Value('string'))
})
context_feat_kd = datasets.Features({
'query': datasets.Value('string'),
'pos': datasets.Sequence(datasets.Value('string')),
'neg': datasets.Sequence(datasets.Value('string')),
'pos_scores': datasets.Sequence(datasets.Value('float')),
'neg_scores': datasets.Sequence(datasets.Value('float')),
})
assert isinstance(args.train_data, list) and len(args.train_data) >= 1
if dist.get_rank() == 0:
self.print_batch_size(batch_size=batch_size, train_group_size=args.train_group_size)
for data_dir in args.train_data:
if not os.path.isdir(data_dir):
raise FileNotFoundError(f"{data_dir} is a file, not a directionary")
small_datasets = []
small_batch_size = math.inf
# Add `parallel_` in `data_dir` to indicate that this dataset is parallel corpus
flag = 'parallel_' in data_dir
for file in os.listdir(data_dir):
if not (file.endswith('.json') or file.endswith('.jsonl')):
continue
file_path = os.path.join(data_dir, file)
if dist.get_rank() == 0:
print(f'loading data from {file_path} ...')
try:
temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=args.cache_path, features=context_feat)
except:
temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=args.cache_path, features=context_feat_kd)
if not args.knowledge_distillation:
temp_dataset = temp_dataset.remove_columns(['pos_scores', 'neg_scores'])
if len(temp_dataset) == 0:
continue
elif len(temp_dataset) < SMALL_THRESHOLD:
small_datasets.append(temp_dataset)
small_batch_size = min(small_batch_size, self.get_file_batch_size(file, batch_size, train_group_size=args.train_group_size))
else:
if args.max_example_num_per_dataset is not None and len(temp_dataset) > args.max_example_num_per_dataset:
temp_dataset = temp_dataset.select(
random.sample(list(range(len(temp_dataset))), args.max_example_num_per_dataset))
train_datasets.append(temp_dataset)
each_data_inxs.append(np.arange(len(temp_dataset)) + cur_all_num)
cur_all_num += len(temp_dataset)
batch_size_inxs.append(self.get_file_batch_size(file, batch_size, train_group_size=args.train_group_size))
pqloss_flag.append(flag)
if len(small_datasets) > 0:
small_dataset = datasets.concatenate_datasets(small_datasets)
if len(small_dataset) >= DROP_THRESHOLD:
train_datasets.append(small_dataset)
each_data_inxs.append(np.arange(len(small_dataset)) + cur_all_num)
cur_all_num += len(small_dataset)
batch_size_inxs.append(small_batch_size)
pqloss_flag.append(flag)
self.dataset = datasets.concatenate_datasets(train_datasets)
self.each_data_inxs = each_data_inxs
self.datasets_inxs = np.arange(len(each_data_inxs))
self.batch_size_inxs = batch_size_inxs
self.pqloss_flag = pqloss_flag
self.process_index = process_index
self.num_processes = num_processes
self.args = args
self.shuffle_ratio = args.shuffle_ratio
self.deterministic_generator = np.random.default_rng(seed)
self.step = 0
self.refresh_epoch()
def print_batch_size(self, batch_size: int, train_group_size: int):
length_list = ['0-500', '500-1000', '1000-2000', '2000-3000', '3000-4000', '4000-5000', '5000-6000', '6000-7000', '7000-inf']
batch_size_dict = {
k: self.get_file_batch_size(f"len-{k}.jsonl", batch_size, train_group_size) for k in length_list
}
batch_size_list = [
f'{length}: {batch_size_dict[length]}' for length in length_list
]
print("=========================")
print("Batch Size Dict:")
pprint(batch_size_list)
print("=========================")
@staticmethod
def get_file_batch_size(file: str, batch_size: int, train_group_size: int):
if train_group_size == 8:
# 80GB
if 'len-0-500.jsonl' in file:
return 48
elif 'len-500-1000.jsonl' in file:
return 32
elif 'len-1000-2000.jsonl' in file:
return 20
elif 'len-2000-3000.jsonl' in file:
return 18
elif 'len-3000-4000.jsonl' in file:
return 14
elif 'len-4000-5000.jsonl' in file:
return 14
elif 'len-5000-6000.jsonl' in file:
return 12
elif 'len-6000-7000.jsonl' in file:
return 10
elif 'len-7000-inf.jsonl' in file:
return 8
else:
return batch_size
elif train_group_size == 1:
# 80GB
if 'len-0-500.jsonl' in file:
return 700
elif 'len-500-1000.jsonl' in file:
return 570
elif 'len-1000-2000.jsonl' in file:
return 388
elif 'len-2000-3000.jsonl' in file:
return 288
elif 'len-3000-4000.jsonl' in file:
return 224
elif 'len-4000-5000.jsonl' in file:
return 180
elif 'len-5000-6000.jsonl' in file:
return 157
elif 'len-6000-7000.jsonl' in file:
return 128
elif 'len-7000-inf.jsonl' in file:
return 104
else:
return batch_size
else:
return batch_size
def refresh_epoch(self):
print(f'---------------------------*Rank {self.process_index}: refresh data---------------------------')
self.deterministic_generator.shuffle(self.datasets_inxs)
# Dynamically adjust batch size
batch_datas = []
for dataset_inx in self.datasets_inxs:
self.deterministic_generator.shuffle(self.each_data_inxs[dataset_inx])
cur_batch_size = self.batch_size_inxs[dataset_inx]*self.num_processes
flag = self.pqloss_flag[dataset_inx]
for start_index in range(0, len(self.each_data_inxs[dataset_inx]), cur_batch_size):
# judge the last batch's length
if len(self.each_data_inxs[dataset_inx]) - start_index < 2 * self.num_processes:
break
batch_datas.append((self.each_data_inxs[dataset_inx][start_index:start_index+cur_batch_size], flag))
self.deterministic_generator.shuffle(batch_datas)
self.batch_datas = batch_datas
self.step = 0
def __getitem__(self, _):
batch_indices, pqloss_flag = self.batch_datas[self.step]
cur_batch_size = int(len(batch_indices) / self.num_processes)
batch_indices = batch_indices[self.process_index * cur_batch_size: (self.process_index + 1) * cur_batch_size]
batch_data = self.dataset[batch_indices]
self.step += 1
queries, passages, teacher_scores = self.create_batch_data(batch_raw_data=batch_data)
# print('rank, step, flag, query, passage:', dist.get_rank(), self.step, pqloss_flag, queries, passages)
return queries, passages, teacher_scores, pqloss_flag
def shuffle_text(self, text):
if self.shuffle_ratio > 0 and len(text) > 100 and random.random() < self.shuffle_ratio:
split_text = []
chunk_size = len(text)//3 + 1
for i in range(0, len(text), chunk_size):
split_text.append(text[i:i+chunk_size])
random.shuffle(split_text)
return " ".join(split_text)
else:
return text
def create_batch_data(self, batch_raw_data):
queries, passages = [], []
teacher_scores = []
for i in range(len(batch_raw_data['query'])):
queries.append(batch_raw_data['query'][i])
pos_inx = random.choice(list(range(len(batch_raw_data['pos'][i]))))
passages.append(self.shuffle_text(batch_raw_data['pos'][i][pos_inx]))
if 'pos_scores' in batch_raw_data and batch_raw_data['pos_scores'][i] is not None:
teacher_scores.append(batch_raw_data['pos_scores'][i][pos_inx])
neg_inx_set = list(range(len(batch_raw_data['neg'][i])))
if len(batch_raw_data['neg'][i]) < self.args.train_group_size - 1:
num = math.ceil((self.args.train_group_size - 1) / len(batch_raw_data['neg'][i]))
neg_inxs = random.sample(neg_inx_set * num, self.args.train_group_size - 1)
else:
neg_inxs = random.sample(neg_inx_set, self.args.train_group_size - 1)
if 'neg_scores' in batch_raw_data and batch_raw_data['neg_scores'][i] is not None:
neg_scores = [(x, batch_raw_data['neg_scores'][i][x]) for x in neg_inxs]
neg_scores = sorted(neg_scores, key=lambda x:x[1], reverse=True)
neg_inxs = [x[0] for x in neg_scores]
teacher_scores.extend([x[1] for x in neg_scores])
negs = [batch_raw_data['neg'][i][x] for x in neg_inxs]
passages.extend(negs)
if len(teacher_scores) > 0 and len(passages) > 0:
assert len(teacher_scores) == len(passages)
if self.args.query_instruction_for_retrieval is not None:
queries = [self.args.query_instruction_for_retrieval+q for q in queries]
if self.args.passage_instruction_for_retrieval is not None:
passages = [self.args.passage_instruction_for_retrieval+p for p in passages]
if len(teacher_scores) == 0:
teacher_scores = None
return queries, passages, teacher_scores
def __len__(self):
return len(self.batch_datas) * self.num_processes
@dataclass
class EmbedCollator(DataCollatorWithPadding):
    """
    Collates (query, passage[, teacher_scores[, flag]]) tuples into separately
    tokenized query/passage batches, hiding data-layout details from the model.
    """
    query_max_len: int = 32
    passage_max_len: int = 128

    def __call__(self, features):
        queries = [item[0] for item in features]
        passages = [item[1] for item in features]

        # Optional third element: teacher scores for distillation.
        scores = None
        if len(features[0]) > 2:
            raw_scores = [item[2] for item in features]
            if raw_scores[0] is not None:
                scores = torch.FloatTensor(raw_scores)

        # Optional fourth element: the parallel-corpus flag; the first
        # sample's value is used for the whole batch.
        direction_flag = None
        if len(features[0]) == 4:
            direction_flag = features[0][3]

        # Elements may themselves be lists (batch-yielding datasets):
        # flatten one level in that case.
        if isinstance(queries[0], list):
            flat = []
            for group in queries:
                flat.extend(group)
            queries = flat
        if isinstance(passages[0], list):
            flat = []
            for group in passages:
                flat.extend(group)
            passages = flat

        # Dynamic padding to the longest sequence in the batch.
        # (padding='max_length' was only used when calibrating the batch
        # sizes in `get_file_batch_size()`.)
        q_collated = self.tokenizer(
            queries,
            padding=True,
            truncation=True,
            max_length=self.query_max_len,
            return_tensors="pt",
        )
        d_collated = self.tokenizer(
            passages,
            padding=True,
            truncation=True,
            max_length=self.passage_max_len,
            return_tensors="pt",
        )

        if scores is not None:
            # One row of scores per query.
            scores = scores.reshape((len(q_collated['input_ids']), -1))
        return {"query": q_collated, "passage": d_collated, "teacher_scores": scores, "bi_directions": direction_flag}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment