"vscode:/vscode.git/clone" did not exist on "cdd12bde173f1c4589d981e16a7a6f467b137870"
reading_data.py 5.34 KB
Newer Older
ziqiaomeng's avatar
ziqiaomeng committed
1
2
3
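# DataReader builds the vocabulary, sub-sampling and negative-sampling
# tables from a file of metapath2vec random walks; Metapath2vecDataset
# streams skip-gram (center, context, negatives) triples from that file.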
import numpy as np
import torch
from torch.utils.data import Dataset

from download import AminerDataset

np.random.seed(12345)

class DataReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, dataset, min_count, care_type):
        self.negatives = []
        self.discards = []
        self.negpos = 0
        self.care_type = care_type
        self.word2id = dict()
        self.id2word = dict()
        self.sentences_count = 0
        self.token_count = 0
        self.word_frequency = dict()
        self.inputFileName = dataset.fn
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()

    def read_words(self, min_count):
        word_frequency = dict()
        for line in open(self.inputFileName, encoding="ISO-8859-1"):
            line = line.split()
            if len(line) > 1:
                self.sentences_count += 1
                for word in line:
                    if len(word) > 0:
                        self.token_count += 1
                        word_frequency[word] = word_frequency.get(word, 0) + 1

                        if self.token_count % 1000000 == 0:
                            print(f"Read {self.token_count // 1000000}M words.")

        wid = 0
        for w, c in word_frequency.items():
            if c < min_count:
                continue
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1

        self.word_count = len(self.word2id)
        print("Total embeddings: " + str(len(self.word2id)))

    def initTableDiscards(self):
        # Build the keep-probability table for sub-sampling frequent words,
        # using the word2vec heuristic P_keep(w) = sqrt(t / f(w)) + t / f(w),
        # where f(w) is the word's relative frequency.
        t = 0.0001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        # Build the table for negative sampling: each word id is repeated in
        # proportion to its frequency raised to 0.75, so a word with twice
        # the smoothed weight is listed twice as often.
        pow_frequency = np.array(list(self.word_frequency.values())) ** 0.75
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * DataReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)
        self.sampling_prob = ratio

    def getNegatives(self, target, size):  # TODO check equality with target
        if self.care_type == 0:
            response = self.negatives[self.negpos : self.negpos + size]
            self.negpos = (self.negpos + size) % len(self.negatives)
            if len(response) != size:
                return np.concatenate((response, self.negatives[0 : self.negpos]))
            return response
        # Hedged fallback: care_type != 0 is not served by the table walk
        # above (which would leave `response` unbound), so sample directly
        # from the same smoothed unigram distribution; the intended
        # care_type != 0 behavior is not specified in this file.
        return np.random.choice(len(self.sampling_prob), size, p=self.sampling_prob)


# -----------------------------------------------------------------------------------------------------------------
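
# A minimal usage sketch for DataReader (hedged: the AminerDataset
# constructor argument below is hypothetical; the class only needs to
# expose the walk-file path as `.fn`, as used in __init__ above):
#
#   dataset = AminerDataset("net_aminer")  # hypothetical path argument
#   data = DataReader(dataset, min_count=5, care_type=0)
#   neg_ids = data.getNegatives(target=None, size=5)  # 5 negative word ids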

class Metapath2vecDataset(Dataset):
    def __init__(self, data, window_size):
        # keep the DataReader, the context window size, and an open
        # handle on the walks file
        self.data = data
        self.window_size = window_size
        self.input_file = open(data.inputFileName, encoding="ISO-8859-1")

    def __len__(self):
        # return the number of walks
        return self.data.sentences_count

    def __getitem__(self, idx):
        # Return the list of (center, context, 5 negatives) triples for one
        # walk. `idx` is unused: walks are read sequentially and the file is
        # rewound at EOF.
        while True:
            line = self.input_file.readline()
            if not line:
                self.input_file.seek(0, 0)
                line = self.input_file.readline()

            if len(line) > 1:
                words = line.split()

                if len(words) > 1:
                    word_ids = [
                        self.data.word2id[w]
                        for w in words
                        if w in self.data.word2id
                        and np.random.rand()
                        < self.data.discards[self.data.word2id[w]]
                    ]

                    pair_catch = []
                    for i, u in enumerate(word_ids):
                        # Keep the slice start so slice-relative j can be
                        # mapped back to an absolute position.
                        start = max(i - self.window_size, 0)
                        for j, v in enumerate(
                            word_ids[start : i + self.window_size]
                        ):
                            assert u < self.data.word_count
                            assert v < self.data.word_count
                            if start + j == i:
                                # skip the (center, center) pair; comparing
                                # i == j is only correct when start == 0
                                continue
                            pair_catch.append(
                                (u, v, self.data.getNegatives(v, 5))
                            )
                    return pair_catch

    @staticmethod
    def collate(batches):
        # Flatten the per-walk lists of (u, v, neg_v) triples into three
        # aligned lists; empty walks contribute nothing.
        all_u = [u for batch in batches for u, _, _ in batch]
        all_v = [v for batch in batches for _, v, _ in batch]
        all_neg_v = [neg_v for batch in batches for _, _, neg_v in batch]

        return (
            torch.LongTensor(all_u),
            torch.LongTensor(all_v),
            torch.LongTensor(all_neg_v),
        )
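
# Wiring sketch (hedged: the batch_size and window_size values below are
# illustrative, not taken from this repo). The custom collate flattens the
# variable-length per-walk lists into flat skip-gram tensors:
#
#   from torch.utils.data import DataLoader
#   dataset = Metapath2vecDataset(data, window_size=5)
#   loader = DataLoader(dataset, batch_size=32, shuffle=False,
#                       collate_fn=dataset.collate)
#   for pos_u, pos_v, neg_v in loader:
#       ...  # pos_u: [N], pos_v: [N], neg_v: [N, 5] LongTensors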