"""Build Dataset for Controllable Conversational Model"""

import os
import torch
import numpy as np

from megatron import get_tokenizer
from megatron import print_rank_0

def read_data(tokenizer, data_path, train_module):
    """Read a tab-separated dialog file and tokenize it into samples.

    Every non-blank line must contain 2, 3 or 4 tab-separated fields:

    * first field: the dialog context, turns joined by ``" [SEP] "``;
    * last field: the response;
    * second-to-last field (3- and 4-field lines): the control sentence;
    * second field (4-field lines only): control codes joined by
      ``" [CTRL] "``.

    Args:
        tokenizer: object exposing ``tokenize(text)`` (must return a list),
            ``sep_id`` and ``ctrl_id``.
        data_path: path of the text file to read.
        train_module: ``"dialog"`` or ``"control"``.
            ``"dialog"``: the input is every turn of the context joined by
            ``sep_id``, followed by ``ctrl_id`` and the control sentence when
            the line has one; the output is the response.
            ``"control"``: 2-field lines (no control sentence) are skipped;
            the input is the last turn followed by ``ctrl_id`` + code for each
            control code; the output is the control sentence.

    Returns:
        A list of ``{"input_ids": [...], "output_ids": [...]}`` dicts, one per
        line that was kept, in file order.

    Raises:
        ValueError: if ``train_module`` is not a supported name, or a
            non-blank line does not have 2, 3 or 4 fields.
    """
    # Validate the mode before touching the file so that a bad mode is
    # reported even when the file happens to be empty.
    if train_module not in ("dialog", "control"):
        raise ValueError(
            "Please input a correct train-module name! "
            "(either dialog or control)")

    data_list = []
    # Explicit encoding: do not depend on the platform's locale default.
    with open(data_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line carries no sample; skip it instead of failing.
                continue

            splits = line.split("\t")
            length_split = len(splits)
            # Raised explicitly (not via assert) so the check survives -O.
            if length_split not in (2, 3, 4):
                raise ValueError(
                    "%s:%d: expected 2, 3 or 4 tab-separated fields, got %d"
                    % (data_path, line_no, length_split))

            if train_module == "dialog":
                data_list.append(_build_dialog_sample(tokenizer, splits))
            elif length_split > 2:
                # "control" mode needs a control sentence, which 2-field
                # lines do not have.
                data_list.append(_build_control_sample(tokenizer, splits))

    return data_list


def _build_dialog_sample(tokenizer, splits):
    """Build one "dialog" sample: context (+ control sentence) -> response."""
    # All turns of the context are used; no truncation to recent turns.
    turns = splits[0].split(" [SEP] ")

    # str.split always yields at least one element, so turns[0] exists.
    input_ids = tokenizer.tokenize(turns[0])
    for turn in turns[1:]:
        input_ids.extend([tokenizer.sep_id] + tokenizer.tokenize(turn))

    if len(splits) > 2:
        # When there is a control sentence, append it to the input.
        input_ids.extend([tokenizer.ctrl_id] + tokenizer.tokenize(splits[-2]))

    output_ids = tokenizer.tokenize(splits[-1])
    return {"input_ids": input_ids, "output_ids": output_ids}


def _build_control_sample(tokenizer, splits):
    """Build one "control" sample: last turn (+ codes) -> control sentence."""
    last_turn = splits[0].split(" [SEP] ")[-1]
    input_ids = tokenizer.tokenize(last_turn)

    # Control codes are only present on 4-field lines; an empty code field is
    # treated the same as no codes.
    ctrl_code = splits[1] if len(splits) == 4 else None
    if ctrl_code:
        for code in ctrl_code.split(" [CTRL] "):
            input_ids.extend([tokenizer.ctrl_id] + tokenizer.tokenize(code))

    output_ids = tokenizer.tokenize(splits[-2])
    return {"input_ids": input_ids, "output_ids": output_ids}


def data_shuffle(data, seed):
    """Shuffle ``data`` in place, reproducibly, and return the same object.

    NumPy's global random state is seeded with ``seed`` right before the
    shuffle, so the same seed and input order always give the same
    permutation. The global ``np.random`` state stays re-seeded afterwards.
    """
    global_rng = np.random
    global_rng.seed(seed)
    global_rng.shuffle(data)
    return data


class ControlDialogDataset(torch.utils.data.Dataset):
    """Map-style dataset producing fixed-length token and loss-mask arrays.

    ``data`` is indexed by position and every element is read through its
    ``"input_ids"`` and ``"output_ids"`` keys. Each item is laid out as
    ``input_ids + [sep_id] + output_ids + [eod_id]``, then padded with
    ``pad_id`` or truncated to exactly ``max_seq_len + 1`` tokens.
    """

    def __init__(self, data, max_seq_len, sep_id, pad_id, eod_id):
        # Samples plus the special token ids used for separation, padding and
        # end-of-document when an item is assembled.
        self.data = data
        self.max_seq_len = max_seq_len
        self.sep_id = sep_id
        self.pad_id = pad_id
        self.eod_id = eod_id

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"text", "loss_mask"}`` int64 arrays for sample ``idx``.

        ``text`` has ``max_seq_len + 1`` entries and ``loss_mask`` has
        ``max_seq_len`` entries. The mask is 0 over the input tokens and the
        padding, and 1 over the ``len(output_ids) + 1`` positions that follow
        the input.
        """
        sample = self.data[idx]
        input_ids = sample["input_ids"]
        output_ids = sample["output_ids"]

        assert len(input_ids) < self.max_seq_len, "Set a larger max-seq-len!"

        # The mask is built one element shorter than the text.
        # NOTE(review): presumably so it lines up with next-token labels
        # (text[1:]) -- confirm against the training step.
        text = input_ids + [self.sep_id] + output_ids + [self.eod_id]
        loss_mask = [0] * len(input_ids) + [1] * (len(output_ids) + 1)

        # A non-positive pad count yields an empty pad list, so one
        # "pad, then slice" expression covers both padding and truncation.
        pad_count = self.max_seq_len + 1 - len(text)
        text = (text + [self.pad_id] * pad_count)[:self.max_seq_len + 1]
        loss_mask = (loss_mask + [0] * pad_count)[:self.max_seq_len]

        return {
            "text": np.array(text, dtype=np.int64),
            "loss_mask": np.array(loss_mask, dtype=np.int64),
        }


def build_train_valid_test_datasets(data_folder, dataset_name, train_module, max_seq_len, seed):
    """Build train, valid, and test datasets.

    The three split files are looked up under
    ``<data_folder>/<dataset_name>/processed/``, tokenized with the tokenizer
    returned by ``get_tokenizer()``, and wrapped in ``ControlDialogDataset``.
    Only the training samples are shuffled (seeded with ``seed``).

    ``dataset_name`` must be a key of the file-name table below
    (currently only ``"wizard_of_wikipedia"``); any other name raises
    ``KeyError``.

    Returns:
        A ``(train_dataset, valid_dataset, test_dataset)`` tuple.
    """
    dataname_dict = {
        "wizard_of_wikipedia": {
            "train": "train_entity_based_control.txt",
            "valid": "valid_random_split_entity_based_control.txt",
            "test": "test_random_split_entity_based_control.txt",
        }
    }

    # Resolve the three split paths first, before any file is read.
    processed_prefix = dataset_name + "/processed/"
    split_files = dataname_dict[dataset_name]
    train_data_path = os.path.join(data_folder, processed_prefix + split_files["train"])
    valid_data_path = os.path.join(data_folder, processed_prefix + split_files["valid"])
    test_data_path = os.path.join(data_folder, processed_prefix + split_files["test"])

    tokenizer = get_tokenizer()

    # Tokenize each split; the training split alone is shuffled.
    train_samples = data_shuffle(
        read_data(tokenizer, train_data_path, train_module), seed)
    valid_samples = read_data(tokenizer, valid_data_path, train_module)
    test_samples = read_data(tokenizer, test_data_path, train_module)

    def wrap(samples):
        # Every split shares the same sequence length and special token ids.
        return ControlDialogDataset(
            samples,
            max_seq_len,
            sep_id=tokenizer.sep_id,
            pad_id=tokenizer.pad_id,
            eod_id=tokenizer.eod_id,
        )

    return wrap(train_samples), wrap(valid_samples), wrap(test_samples)