# pre_process.py
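"""Pre-process a Wikipedia corpus for retrieval: download Cohere/wikipedia-22-12 (zh),
build a BM25 index over it with Pyserini, and encode every document into dense
embeddings saved as a NumPy array."""
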
import json
import os
import subprocess

import numpy as np
import torch
from arguments import ModelArguments, DataArguments
from datasets import load_dataset
# The dataset yields plain tensors, so the standard DataLoader is sufficient
# (torch_geometric.data.DataLoader is deprecated and unnecessary here).
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, HfArgumentParser, is_torch_npu_available
from transformers import PreTrainedTokenizer, AutoModel


class EmbDataset(Dataset):
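    """Reads a Pyserini-style JSON collection and tokenizes each document's
    'contents' field to a fixed length of 512 tokens."""
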
    def __init__(
            self,
            tokenizer: PreTrainedTokenizer,
            path: str
    ):
        self.tokenizer = tokenizer
        with open(path, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        sentences = self.data[item]['contents']
        # padding='max_length' pads with the tokenizer's real pad_token_id and
        # returns fixed-length (512) tensors, so no manual padding is needed.
        batch_dict = self.tokenizer(sentences, max_length=512, padding='max_length',
                                    truncation=True, return_tensors='pt')

        return batch_dict['input_ids'][0], batch_dict['token_type_ids'][0], batch_dict['attention_mask'][0]


def inference(json_path, emb_path, model_path):
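    """Encode every document in the JSON collection at json_path with the model
    at model_path, then save the L2-normalized embeddings to emb_path as .npy."""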
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif is_torch_npu_available():
        device = torch.device("npu")
    else:
        device = torch.device("cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModel.from_pretrained(model_path).to(device)
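    # DataParallel splits each batch across all visible GPUs when more than one is available.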
    model = torch.nn.parallel.DataParallel(model)

    dataset = EmbDataset(tokenizer, json_path)
    # SequentialSampler keeps documents in collection order, so embedding row i
    # corresponds to document id i; drop_last=False keeps the final partial batch.
    loader = DataLoader(dataset=dataset, batch_size=2048, sampler=SequentialSampler(dataset),
                        drop_last=False, num_workers=16)

    model.eval()
    all_embeddings = []
    for input_ids, token_type_ids, attention_mask in tqdm(loader):
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
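            # CLS pooling: use the first token's hidden state as the document embedding.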
            batch_vecs = outputs[0][:, 0]
            batch_vecs = torch.nn.functional.normalize(batch_vecs, p=2, dim=-1).detach().cpu().numpy()

            all_embeddings.append(batch_vecs)

    np.save(emb_path, np.concatenate(all_embeddings))


def build_bm25_index(dataset, collection_path, index_path):
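    """Dump the corpus as a Pyserini JsonCollection and build a Lucene/BM25 index over it."""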
    title = dataset['title']
    text = dataset['text']
    json_list = []
    for i in range(len(title)):
        json_dict = {'id': i, 'contents': title[i] + ' -- ' + text[i]}
        json_list.append(json_dict)

    with open(os.path.join(collection_path, 'documents.json'), 'w') as f:
        json.dump(json_list, f)

    command = f"python -u -m pyserini.index.lucene   --collection JsonCollection   --input {collection_path}  --index {index_path}  --generator DefaultLuceneDocumentGenerator   --threads 8   --storePositions --storeDocvectors --storeRaw"
    result = subprocess.run(command, capture_output=True, text=True, shell=True)
    if result.returncode == 0:
        output = result.stdout
        print("execute successful!")
        print(output)
    else:
        print("execute false!")
        print(result.stderr)


if __name__ == "__main__":
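    # Pipeline: parse arguments, create the working directories, download the
    # corpus, build the BM25 index, then embed the whole collection.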
    parser = HfArgumentParser((ModelArguments, DataArguments))
    model_args, data_args = parser.parse_args_into_dataclasses()
    dataset_path = os.path.join(data_args.data_path, 'dataset')
    collection_path = os.path.join(data_args.data_path, 'collection')
    index_path = os.path.join(data_args.data_path, 'index')
    emb_path = os.path.join(data_args.data_path, 'emb')
    os.makedirs(dataset_path, exist_ok=True)
    os.makedirs(collection_path, exist_ok=True)
    os.makedirs(index_path, exist_ok=True)
    os.makedirs(emb_path, exist_ok=True)
    dataset = load_dataset("Cohere/wikipedia-22-12", 'zh', split='train')
    dataset.save_to_disk(dataset_path)
    build_bm25_index(dataset, collection_path, index_path)
    inference(os.path.join(collection_path, 'documents.json'),
              os.path.join(emb_path, 'data.npy'),
              model_args.model_name_or_path)