# pre_process.py
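"""Pre-process a Wikipedia corpus for retrieval: download Cohere/wikipedia-22-12 (zh),
build a BM25 index over it with Pyserini, and encode every document into dense
embeddings saved as a NumPy array."""
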
import json
import os
import subprocess

import numpy as np
import torch
from arguments import ModelArguments, DataArguments
from datasets import load_dataset
# The dataset yields plain tensors, so the standard DataLoader is sufficient
# (torch_geometric.data.DataLoader is deprecated and unnecessary here).
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, HfArgumentParser, is_torch_npu_available
from transformers import PreTrainedTokenizer, AutoModel


class EmbDataset(Dataset):
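    """Reads a Pyserini-style JSON collection and tokenizes each document's
    'contents' field to a fixed length of 512 tokens."""
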
    def __init__(
            self,
            tokenizer: PreTrainedTokenizer,
            path: str
    ):
        self.tokenizer = tokenizer
        with open(path, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        sentences = self.data[item]['contents']
        # padding='max_length' pads with the tokenizer's real pad_token_id and
        # returns fixed-length (512) tensors, so no manual padding is needed.
        batch_dict = self.tokenizer(sentences, max_length=512, padding='max_length',
                                    truncation=True, return_tensors='pt')

        return batch_dict['input_ids'][0], batch_dict['token_type_ids'][0], batch_dict['attention_mask'][0]


def inference(json_path, emb_path, model_path):
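    """Encode every document in the JSON collection at json_path with the model
    at model_path, then save the L2-normalized embeddings to emb_path as .npy."""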
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif is_torch_npu_available():
        device = torch.device("npu")
    else:
        device = torch.device("cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModel.from_pretrained(model_path).to(device)
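    # DataParallel splits each batch across all visible GPUs when more than one is available.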
    model = torch.nn.parallel.DataParallel(model)

    dataset = EmbDataset(tokenizer, json_path)
    # SequentialSampler keeps documents in collection order, so embedding row i
    # corresponds to document id i; drop_last=False keeps the final partial batch.
    loader = DataLoader(dataset=dataset, batch_size=2048, sampler=SequentialSampler(dataset),
                        drop_last=False, num_workers=16)

    model.eval()
    all_embeddings = []
    for input_ids, token_type_ids, attention_mask in tqdm(loader):
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
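            # CLS pooling: use the first token's hidden state as the document embedding.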
            batch_vecs = outputs[0][:, 0]
            batch_vecs = torch.nn.functional.normalize(batch_vecs, p=2, dim=-1).detach().cpu().numpy()

            all_embeddings.append(batch_vecs)

    np.save(emb_path, np.concatenate(all_embeddings))


def build_bm25_index(dataset, collection_path, index_path):
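    """Dump the corpus as a Pyserini JsonCollection and build a Lucene/BM25 index over it."""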
    title = dataset['title']
    text = dataset['text']
    json_list = []
    for i in range(len(title)):
        json_dict = {'id': i, 'contents': title[i] + ' -- ' + text[i]}
        json_list.append(json_dict)

    with open(os.path.join(collection_path, 'documents.json'), 'w') as f:
        json.dump(json_list, f)

    command = f"python -u -m pyserini.index.lucene   --collection JsonCollection   --input {collection_path}  --index {index_path}  --generator DefaultLuceneDocumentGenerator   --threads 8   --storePositions --storeDocvectors --storeRaw"
    result = subprocess.run(command, capture_output=True, text=True, shell=True)
    if result.returncode == 0:
        output = result.stdout
        print("execute successful!")
        print(output)
    else:
        print("execute false!")
        print(result.stderr)


if __name__ == "__main__":
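    # Pipeline: parse arguments, create the working directories, download the
    # corpus, build the BM25 index, then embed the whole collection.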
    parser = HfArgumentParser((ModelArguments, DataArguments))
    model_args, data_args = parser.parse_args_into_dataclasses()
    dataset_path = os.path.join(data_args.data_path, 'dataset')
    collection_path = os.path.join(data_args.data_path, 'collection')
    index_path = os.path.join(data_args.data_path, 'index')
    emb_path = os.path.join(data_args.data_path, 'emb')
    os.makedirs(dataset_path, exist_ok=True)
    os.makedirs(collection_path, exist_ok=True)
    os.makedirs(index_path, exist_ok=True)
    os.makedirs(emb_path, exist_ok=True)
    dataset = load_dataset("Cohere/wikipedia-22-12", 'zh', split='train')
    dataset.save_to_disk(dataset_path)
    build_bm25_index(dataset, collection_path, index_path)
    inference(os.path.join(collection_path, 'documents.json'),
              os.path.join(emb_path, 'data.npy'),
              model_args.model_name_or_path)