hgt_sample_cpu.cpp 8.95 KB
Newer Older
rusty1s's avatar
rusty1s committed
1
2
#include "hgt_sample_cpu.h"

rusty1s's avatar
rusty1s committed
3
4
#include <process.h>

rusty1s's avatar
rusty1s committed
5
#include "utils.h"
rusty1s's avatar
rusty1s committed
6

rusty1s's avatar
rusty1s committed
7
#define MAX_NEIGHBORS 50
rusty1s's avatar
update  
rusty1s committed
8

rusty1s's avatar
typos  
rusty1s committed
9
10
using namespace std;

rusty1s's avatar
rusty1s committed
11
edge_t split(const rel_t &rel_type) {
rusty1s's avatar
rusty1s committed
12
13
  vector<string> result(3);
  int start = 0, end;
rusty1s's avatar
rusty1s committed
14
  for (int i = 0; i < 3; i++) {
rusty1s's avatar
rusty1s committed
15
    end = rel_type.find("__", start);
rusty1s's avatar
rusty1s committed
16
17
18
    result[i] = rel_type.substr(start, end - start);
    start = end + 2;
  }
rusty1s's avatar
rusty1s committed
19
  return make_tuple(result[0], result[1], result[2]);
rusty1s's avatar
rusty1s committed
20
21
}

rusty1s's avatar
rusty1s committed
22
23
void update_budget_(
    unordered_map<node_t, unordered_map<int64_t, float>> *budget_dict,
rusty1s's avatar
typos  
rusty1s committed
24
    const node_t &node_type, const vector<int64_t> &samples,
rusty1s's avatar
rusty1s committed
25
26
27
    const unordered_map<node_t, unordered_map<int64_t, int64_t>>
        &to_local_node_dict,
    const unordered_map<rel_t, edge_t> &to_edge_type,
rusty1s's avatar
update  
rusty1s committed
28
29
    const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
    const c10::Dict<rel_t, torch::Tensor> &row_dict) {
rusty1s's avatar
rusty1s committed
30

rusty1s's avatar
rusty1s committed
31
32
33
  if (samples.empty())
    return;

rusty1s's avatar
update  
rusty1s committed
34
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
35
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
36
37
38
    const auto &edge_type = to_edge_type.at(rel_type);
    const auto &src_node_type = get<0>(edge_type);
    const auto &dst_node_type = get<2>(edge_type);
rusty1s's avatar
rusty1s committed
39
40
41
42

    if (node_type != dst_node_type)
      continue;

rusty1s's avatar
rusty1s committed
43
    const auto &to_local_src_node = to_local_node_dict.at(src_node_type);
rusty1s's avatar
update  
rusty1s committed
44
45
    const auto *colptr_data = kv.value().data_ptr<int64_t>();
    const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
46
    auto &src_budget = budget_dict->at(src_node_type);
rusty1s's avatar
rusty1s committed
47

rusty1s's avatar
rusty1s committed
48
49
50
    for (const auto &w : samples) {
      const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
      if (col_end - col_start > MAX_NEIGHBORS) {
rusty1s's avatar
update  
rusty1s committed
51
        // There might be same neighbors with large neighborhood sizes.
rusty1s's avatar
rusty1s committed
52
53
54
55
56
57
        // In order to prevent that we fill our budget with many values of low
        // probability, we instead sample a fixed amount without replacement:
        auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
        auto *indices_data = indices.data_ptr<int64_t>();
        for (int64_t i = 0; i < indices.numel(); i++) {
          const auto &v = row_data[col_start + indices_data[i]];
rusty1s's avatar
update  
rusty1s committed
58
          // Only add the neighbor in case we have not yet seen it before:
rusty1s's avatar
rusty1s committed
59
60
          if (to_local_src_node.find(v) == to_local_src_node.end())
            src_budget[v] += 1.f / float(MAX_NEIGHBORS);
rusty1s's avatar
update  
rusty1s committed
61
        }
rusty1s's avatar
rusty1s committed
62
63

      } else if (col_end != col_start) {
rusty1s's avatar
update  
rusty1s committed
64
        const auto inv_deg = 1.f / float(col_end - col_start);
rusty1s's avatar
rusty1s committed
65
66
        for (int64_t i = col_start; i < col_end; i++) {
          const auto &v = row_data[i];
rusty1s's avatar
update  
rusty1s committed
67
          // Only add the neighbor in case we have not yet seen it before:
rusty1s's avatar
rusty1s committed
68
69
          if (to_local_src_node.find(v) == to_local_src_node.end())
            src_budget[v] += inv_deg;
rusty1s's avatar
rusty1s committed
70
71
72
73
74
75
        }
      }
    }
  }
}

rusty1s's avatar
rusty1s committed
76
77
78
79
80
81
vector<int64_t> sample_from(const unordered_map<int64_t, float> &budget,
                            const int64_t num_samples) {
  vector<int64_t> indices;
  vector<float> weights;
  indices.reserve(budget.size());
  weights.reserve(budget.size());
rusty1s's avatar
update  
rusty1s committed
82
  for (const auto &kv : budget) {
rusty1s's avatar
rusty1s committed
83
84
    indices.push_back(kv.first);
    weights.push_back(kv.second * kv.second);
rusty1s's avatar
update  
rusty1s committed
85
  }
rusty1s's avatar
rusty1s committed
86

rusty1s's avatar
rusty1s committed
87
88
89
  const auto weight = from_vector(weights, true);
  const auto sample = choice(budget.size(), num_samples, false, weight);
  const auto *sample_data = sample.data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
90

rusty1s's avatar
rusty1s committed
91
92
93
  vector<int64_t> out(sample.numel());
  for (int64_t i = 0; i < sample.numel(); i++) {
    out[i] = indices[sample_data[i]];
rusty1s's avatar
rusty1s committed
94
  }
rusty1s's avatar
rusty1s committed
95
  return out;
rusty1s's avatar
rusty1s committed
96
97
}

rusty1s's avatar
rusty1s committed
98
99
tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
      c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
rusty1s's avatar
update  
rusty1s committed
100
101
hgt_sample_cpu(const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
               const c10::Dict<rel_t, torch::Tensor> &row_dict,
rusty1s's avatar
rusty1s committed
102
               const c10::Dict<node_t, torch::Tensor> &input_node_dict,
rusty1s's avatar
rusty1s committed
103
104
               const c10::Dict<node_t, vector<int64_t>> &num_samples_dict,
               const int64_t num_hops) {
rusty1s's avatar
rusty1s committed
105

rusty1s's avatar
rusty1s committed
106
107
  srand(time(NULL) + 1000 * getpid()); // Initialize random seed.

rusty1s's avatar
rusty1s committed
108
  // Create a mapping to convert single string relations to edge type triplets:
rusty1s's avatar
rusty1s committed
109
  unordered_map<rel_t, edge_t> to_edge_type;
rusty1s's avatar
update  
rusty1s committed
110
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
111
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
112
    to_edge_type[rel_type] = split(rel_type);
rusty1s's avatar
rusty1s committed
113
114
  }

rusty1s's avatar
rusty1s committed
115
116
117
118
  // Initialize some necessary data structures for the sampling process:
  unordered_map<node_t, vector<int64_t>> nodes_dict;
  unordered_map<node_t, unordered_map<int64_t, int64_t>> to_local_node_dict;
  unordered_map<node_t, unordered_map<int64_t, float>> budget_dict;
rusty1s's avatar
fix  
rusty1s committed
119
120
  for (const auto &kv : num_samples_dict) {
    const auto &node_type = kv.key();
rusty1s's avatar
rusty1s committed
121
122
    nodes_dict[node_type];
    to_local_node_dict[node_type];
rusty1s's avatar
fix  
rusty1s committed
123
124
    budget_dict[node_type];
  }
rusty1s's avatar
rusty1s committed
125

rusty1s's avatar
rusty1s committed
126
  // Add the input nodes to the sampled output nodes (line 1):
rusty1s's avatar
rusty1s committed
127
128
129
130
131
  for (const auto &kv : input_node_dict) {
    const auto &node_type = kv.key();
    const auto &input_node = kv.value();
    const auto *input_node_data = input_node.data_ptr<int64_t>();

rusty1s's avatar
rusty1s committed
132
133
    auto &nodes = nodes_dict.at(node_type);
    auto &to_local_node = to_local_node_dict.at(node_type);
rusty1s's avatar
rusty1s committed
134
    for (int64_t i = 0; i < input_node.numel(); i++) {
rusty1s's avatar
rusty1s committed
135
136
137
      const auto &v = input_node_data[i];
      nodes.push_back(v);
      to_local_node[v] = i;
rusty1s's avatar
rusty1s committed
138
    }
rusty1s's avatar
update  
rusty1s committed
139
  }
rusty1s's avatar
rusty1s committed
140
141
142
143
144
145
146

  // Update the budget based on the initial input set (line 3-5):
  for (const auto &kv : nodes_dict) {
    const auto &node_type = kv.first;
    const auto &last_samples = kv.second;
    update_budget_(&budget_dict, node_type, last_samples, to_local_node_dict,
                   to_edge_type, colptr_dict, row_dict);
rusty1s's avatar
rusty1s committed
147
148
149
  }

  for (int64_t ell = 0; ell < num_hops; ell++) {
rusty1s's avatar
rusty1s committed
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
    unordered_map<node_t, vector<int64_t>> samples_dict;
    for (auto &kv : budget_dict) {
      const auto &node_type = kv.first;
      auto &budget = kv.second;
      const auto num_samples = num_samples_dict.at(node_type)[ell];

      // Sample `num_samples` nodes, according to the budget (line 9-11):
      const auto samples = sample_from(budget, num_samples);
      samples_dict[node_type] = samples;

      // Add samples to the sampled output nodes, and erase them from the budget
      // (line 13/15):
      auto &nodes = nodes_dict.at(node_type);
      auto &to_local_node = to_local_node_dict.at(node_type);
      for (const auto &v : samples) {
        to_local_node[v] = nodes.size();
        nodes.push_back(v);
        budget.erase(v);
      }
rusty1s's avatar
update  
rusty1s committed
169
    }
rusty1s's avatar
rusty1s committed
170

rusty1s's avatar
rusty1s committed
171
172
173
174
175
176
177
178
    if (ell < num_hops - 1) {
      // Add neighbors of newly sampled nodes to the budget (line 14):
      // Note that we do not need to update the budget in the last iteration.
      for (const auto &kv : samples_dict) {
        const auto &node_type = kv.first;
        const auto &last_samples = kv.second;
        update_budget_(&budget_dict, node_type, last_samples,
                       to_local_node_dict, to_edge_type, colptr_dict, row_dict);
rusty1s's avatar
update  
rusty1s committed
179
      }
rusty1s's avatar
rusty1s committed
180
181
182
    }
  }

rusty1s's avatar
rusty1s committed
183
184
185
186
  c10::Dict<node_t, torch::Tensor> out_node_dict;
  c10::Dict<rel_t, torch::Tensor> out_row_dict;
  c10::Dict<rel_t, torch::Tensor> out_col_dict;
  c10::Dict<rel_t, torch::Tensor> out_edge_dict;
rusty1s's avatar
update  
rusty1s committed
187

rusty1s's avatar
rusty1s committed
188
  // Reconstruct the sampled adjacency matrix among the sampled nodes (line 19):
rusty1s's avatar
update  
rusty1s committed
189
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
190
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
191
192
193
    const auto &edge_type = to_edge_type.at(rel_type);
    const auto &src_node_type = get<0>(edge_type);
    const auto &dst_node_type = get<2>(edge_type);
rusty1s's avatar
rusty1s committed
194

rusty1s's avatar
update  
rusty1s committed
195
196
    const auto *colptr_data = kv.value().data_ptr<int64_t>();
    const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
197

rusty1s's avatar
rusty1s committed
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
    const auto &dst_nodes = nodes_dict.at(dst_node_type);
    const auto &to_local_src_node = to_local_node_dict.at(src_node_type);

    vector<int64_t> rows, cols, edges;
    for (int64_t i = 0; i < (int64_t)dst_nodes.size(); i++) {
      const auto &w = dst_nodes[i];
      const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
      if (col_end - col_start > MAX_NEIGHBORS) {
        auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
        auto *indices_data = indices.data_ptr<int64_t>();
        for (int64_t j = 0; j < indices.numel(); j++) {
          const auto &v = row_data[col_start + indices_data[j]];
          if (to_local_src_node.find(v) != to_local_src_node.end()) {
            rows.push_back(to_local_src_node.at(v));
            cols.push_back(i);
            edges.push_back(col_start + j);
          }
        }
      } else {
        for (int64_t j = col_start; j < col_end; j++) {
          const auto &v = row_data[j];
          if (to_local_src_node.find(v) != to_local_src_node.end()) {
            rows.push_back(to_local_src_node.at(v));
            cols.push_back(i);
            edges.push_back(j);
          }
rusty1s's avatar
rusty1s committed
224
225
226
        }
      }
    }
rusty1s's avatar
update  
rusty1s committed
227
    if (rows.size() > 0) {
rusty1s's avatar
rusty1s committed
228
229
230
      out_row_dict.insert(rel_type, from_vector<int64_t>(rows));
      out_col_dict.insert(rel_type, from_vector<int64_t>(cols));
      out_edge_dict.insert(rel_type, from_vector<int64_t>(edges));
rusty1s's avatar
update  
rusty1s committed
231
    }
rusty1s's avatar
rusty1s committed
232
233
  }

rusty1s's avatar
rusty1s committed
234
235
236
237
238
239
  // Generate tensor-valued output node dictionary (line 20):
  for (const auto &kv : nodes_dict) {
    const auto &node_type = kv.first;
    const auto &nodes = kv.second;
    if (!nodes.empty())
      out_node_dict.insert(node_type, from_vector<int64_t>(nodes));
rusty1s's avatar
rusty1s committed
240
241
  }

rusty1s's avatar
rusty1s committed
242
  return make_tuple(out_node_dict, out_row_dict, out_col_dict, out_edge_dict);
rusty1s's avatar
rusty1s committed
243
}