hgt_sample_cpu.cpp 8.97 KB
Newer Older
rusty1s's avatar
rusty1s committed
1
2
#include "hgt_sample_cpu.h"

rusty1s's avatar
rusty1s committed
3
#include "utils.h"
rusty1s's avatar
rusty1s committed
4

rusty1s's avatar
rusty1s committed
5
6
7
8
#ifdef _WIN32
#include <process.h>
#endif

rusty1s's avatar
rusty1s committed
9
#define MAX_NEIGHBORS 50
rusty1s's avatar
update  
rusty1s committed
10

rusty1s's avatar
typos  
rusty1s committed
11
12
using namespace std;

rusty1s's avatar
rusty1s committed
13
edge_t split(const rel_t &rel_type) {
rusty1s's avatar
rusty1s committed
14
15
  vector<string> result(3);
  int start = 0, end;
rusty1s's avatar
rusty1s committed
16
  for (int i = 0; i < 3; i++) {
rusty1s's avatar
rusty1s committed
17
    end = rel_type.find("__", start);
rusty1s's avatar
rusty1s committed
18
19
20
    result[i] = rel_type.substr(start, end - start);
    start = end + 2;
  }
rusty1s's avatar
rusty1s committed
21
  return make_tuple(result[0], result[1], result[2]);
rusty1s's avatar
rusty1s committed
22
23
}

rusty1s's avatar
rusty1s committed
24
25
void update_budget_(
    unordered_map<node_t, unordered_map<int64_t, float>> *budget_dict,
rusty1s's avatar
typos  
rusty1s committed
26
    const node_t &node_type, const vector<int64_t> &samples,
rusty1s's avatar
rusty1s committed
27
28
29
    const unordered_map<node_t, unordered_map<int64_t, int64_t>>
        &to_local_node_dict,
    const unordered_map<rel_t, edge_t> &to_edge_type,
rusty1s's avatar
update  
rusty1s committed
30
31
    const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
    const c10::Dict<rel_t, torch::Tensor> &row_dict) {
rusty1s's avatar
rusty1s committed
32

rusty1s's avatar
rusty1s committed
33
34
35
  if (samples.empty())
    return;

rusty1s's avatar
update  
rusty1s committed
36
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
37
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
38
39
40
    const auto &edge_type = to_edge_type.at(rel_type);
    const auto &src_node_type = get<0>(edge_type);
    const auto &dst_node_type = get<2>(edge_type);
rusty1s's avatar
rusty1s committed
41
42
43
44

    if (node_type != dst_node_type)
      continue;

rusty1s's avatar
rusty1s committed
45
    const auto &to_local_src_node = to_local_node_dict.at(src_node_type);
rusty1s's avatar
update  
rusty1s committed
46
47
    const auto *colptr_data = kv.value().data_ptr<int64_t>();
    const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
48
    auto &src_budget = budget_dict->at(src_node_type);
rusty1s's avatar
rusty1s committed
49

rusty1s's avatar
rusty1s committed
50
51
52
    for (const auto &w : samples) {
      const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
      if (col_end - col_start > MAX_NEIGHBORS) {
rusty1s's avatar
update  
rusty1s committed
53
        // There might be same neighbors with large neighborhood sizes.
rusty1s's avatar
rusty1s committed
54
55
56
57
58
59
        // In order to prevent that we fill our budget with many values of low
        // probability, we instead sample a fixed amount without replacement:
        auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
        auto *indices_data = indices.data_ptr<int64_t>();
        for (int64_t i = 0; i < indices.numel(); i++) {
          const auto &v = row_data[col_start + indices_data[i]];
rusty1s's avatar
update  
rusty1s committed
60
          // Only add the neighbor in case we have not yet seen it before:
rusty1s's avatar
rusty1s committed
61
62
          if (to_local_src_node.find(v) == to_local_src_node.end())
            src_budget[v] += 1.f / float(MAX_NEIGHBORS);
rusty1s's avatar
update  
rusty1s committed
63
        }
rusty1s's avatar
rusty1s committed
64
65

      } else if (col_end != col_start) {
rusty1s's avatar
update  
rusty1s committed
66
        const auto inv_deg = 1.f / float(col_end - col_start);
rusty1s's avatar
rusty1s committed
67
68
        for (int64_t i = col_start; i < col_end; i++) {
          const auto &v = row_data[i];
rusty1s's avatar
update  
rusty1s committed
69
          // Only add the neighbor in case we have not yet seen it before:
rusty1s's avatar
rusty1s committed
70
71
          if (to_local_src_node.find(v) == to_local_src_node.end())
            src_budget[v] += inv_deg;
rusty1s's avatar
rusty1s committed
72
73
74
75
76
77
        }
      }
    }
  }
}

rusty1s's avatar
rusty1s committed
78
79
80
81
82
83
vector<int64_t> sample_from(const unordered_map<int64_t, float> &budget,
                            const int64_t num_samples) {
  vector<int64_t> indices;
  vector<float> weights;
  indices.reserve(budget.size());
  weights.reserve(budget.size());
rusty1s's avatar
update  
rusty1s committed
84
  for (const auto &kv : budget) {
rusty1s's avatar
rusty1s committed
85
86
    indices.push_back(kv.first);
    weights.push_back(kv.second * kv.second);
rusty1s's avatar
update  
rusty1s committed
87
  }
rusty1s's avatar
rusty1s committed
88

rusty1s's avatar
rusty1s committed
89
90
91
  const auto weight = from_vector(weights, true);
  const auto sample = choice(budget.size(), num_samples, false, weight);
  const auto *sample_data = sample.data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
92

rusty1s's avatar
rusty1s committed
93
94
95
  vector<int64_t> out(sample.numel());
  for (int64_t i = 0; i < sample.numel(); i++) {
    out[i] = indices[sample_data[i]];
rusty1s's avatar
rusty1s committed
96
  }
rusty1s's avatar
rusty1s committed
97
  return out;
rusty1s's avatar
rusty1s committed
98
99
}

rusty1s's avatar
rusty1s committed
100
101
tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
      c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
rusty1s's avatar
update  
rusty1s committed
102
103
hgt_sample_cpu(const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
               const c10::Dict<rel_t, torch::Tensor> &row_dict,
rusty1s's avatar
rusty1s committed
104
               const c10::Dict<node_t, torch::Tensor> &input_node_dict,
rusty1s's avatar
rusty1s committed
105
106
               const c10::Dict<node_t, vector<int64_t>> &num_samples_dict,
               const int64_t num_hops) {
rusty1s's avatar
rusty1s committed
107

rusty1s's avatar
rusty1s committed
108
109
  srand(time(NULL) + 1000 * getpid()); // Initialize random seed.

rusty1s's avatar
rusty1s committed
110
  // Create a mapping to convert single string relations to edge type triplets:
rusty1s's avatar
rusty1s committed
111
  unordered_map<rel_t, edge_t> to_edge_type;
rusty1s's avatar
update  
rusty1s committed
112
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
113
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
114
    to_edge_type[rel_type] = split(rel_type);
rusty1s's avatar
rusty1s committed
115
116
  }

rusty1s's avatar
rusty1s committed
117
118
119
120
  // Initialize some necessary data structures for the sampling process:
  unordered_map<node_t, vector<int64_t>> nodes_dict;
  unordered_map<node_t, unordered_map<int64_t, int64_t>> to_local_node_dict;
  unordered_map<node_t, unordered_map<int64_t, float>> budget_dict;
rusty1s's avatar
fix  
rusty1s committed
121
122
  for (const auto &kv : num_samples_dict) {
    const auto &node_type = kv.key();
rusty1s's avatar
rusty1s committed
123
124
    nodes_dict[node_type];
    to_local_node_dict[node_type];
rusty1s's avatar
fix  
rusty1s committed
125
126
    budget_dict[node_type];
  }
rusty1s's avatar
rusty1s committed
127

rusty1s's avatar
rusty1s committed
128
  // Add the input nodes to the sampled output nodes (line 1):
rusty1s's avatar
rusty1s committed
129
130
131
132
133
  for (const auto &kv : input_node_dict) {
    const auto &node_type = kv.key();
    const auto &input_node = kv.value();
    const auto *input_node_data = input_node.data_ptr<int64_t>();

rusty1s's avatar
rusty1s committed
134
135
    auto &nodes = nodes_dict.at(node_type);
    auto &to_local_node = to_local_node_dict.at(node_type);
rusty1s's avatar
rusty1s committed
136
    for (int64_t i = 0; i < input_node.numel(); i++) {
rusty1s's avatar
rusty1s committed
137
138
139
      const auto &v = input_node_data[i];
      nodes.push_back(v);
      to_local_node[v] = i;
rusty1s's avatar
rusty1s committed
140
    }
rusty1s's avatar
update  
rusty1s committed
141
  }
rusty1s's avatar
rusty1s committed
142
143
144
145
146
147
148

  // Update the budget based on the initial input set (line 3-5):
  for (const auto &kv : nodes_dict) {
    const auto &node_type = kv.first;
    const auto &last_samples = kv.second;
    update_budget_(&budget_dict, node_type, last_samples, to_local_node_dict,
                   to_edge_type, colptr_dict, row_dict);
rusty1s's avatar
rusty1s committed
149
150
151
  }

  for (int64_t ell = 0; ell < num_hops; ell++) {
rusty1s's avatar
rusty1s committed
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
    unordered_map<node_t, vector<int64_t>> samples_dict;
    for (auto &kv : budget_dict) {
      const auto &node_type = kv.first;
      auto &budget = kv.second;
      const auto num_samples = num_samples_dict.at(node_type)[ell];

      // Sample `num_samples` nodes, according to the budget (line 9-11):
      const auto samples = sample_from(budget, num_samples);
      samples_dict[node_type] = samples;

      // Add samples to the sampled output nodes, and erase them from the budget
      // (line 13/15):
      auto &nodes = nodes_dict.at(node_type);
      auto &to_local_node = to_local_node_dict.at(node_type);
      for (const auto &v : samples) {
        to_local_node[v] = nodes.size();
        nodes.push_back(v);
        budget.erase(v);
      }
rusty1s's avatar
update  
rusty1s committed
171
    }
rusty1s's avatar
rusty1s committed
172

rusty1s's avatar
rusty1s committed
173
174
175
176
177
178
179
180
    if (ell < num_hops - 1) {
      // Add neighbors of newly sampled nodes to the budget (line 14):
      // Note that we do not need to update the budget in the last iteration.
      for (const auto &kv : samples_dict) {
        const auto &node_type = kv.first;
        const auto &last_samples = kv.second;
        update_budget_(&budget_dict, node_type, last_samples,
                       to_local_node_dict, to_edge_type, colptr_dict, row_dict);
rusty1s's avatar
update  
rusty1s committed
181
      }
rusty1s's avatar
rusty1s committed
182
183
184
    }
  }

rusty1s's avatar
rusty1s committed
185
186
187
188
  c10::Dict<node_t, torch::Tensor> out_node_dict;
  c10::Dict<rel_t, torch::Tensor> out_row_dict;
  c10::Dict<rel_t, torch::Tensor> out_col_dict;
  c10::Dict<rel_t, torch::Tensor> out_edge_dict;
rusty1s's avatar
update  
rusty1s committed
189

rusty1s's avatar
rusty1s committed
190
  // Reconstruct the sampled adjacency matrix among the sampled nodes (line 19):
rusty1s's avatar
update  
rusty1s committed
191
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
192
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
193
194
195
    const auto &edge_type = to_edge_type.at(rel_type);
    const auto &src_node_type = get<0>(edge_type);
    const auto &dst_node_type = get<2>(edge_type);
rusty1s's avatar
rusty1s committed
196

rusty1s's avatar
update  
rusty1s committed
197
198
    const auto *colptr_data = kv.value().data_ptr<int64_t>();
    const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
199

rusty1s's avatar
rusty1s committed
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
    const auto &dst_nodes = nodes_dict.at(dst_node_type);
    const auto &to_local_src_node = to_local_node_dict.at(src_node_type);

    vector<int64_t> rows, cols, edges;
    for (int64_t i = 0; i < (int64_t)dst_nodes.size(); i++) {
      const auto &w = dst_nodes[i];
      const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
      if (col_end - col_start > MAX_NEIGHBORS) {
        auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
        auto *indices_data = indices.data_ptr<int64_t>();
        for (int64_t j = 0; j < indices.numel(); j++) {
          const auto &v = row_data[col_start + indices_data[j]];
          if (to_local_src_node.find(v) != to_local_src_node.end()) {
            rows.push_back(to_local_src_node.at(v));
            cols.push_back(i);
            edges.push_back(col_start + j);
          }
        }
      } else {
        for (int64_t j = col_start; j < col_end; j++) {
          const auto &v = row_data[j];
          if (to_local_src_node.find(v) != to_local_src_node.end()) {
            rows.push_back(to_local_src_node.at(v));
            cols.push_back(i);
            edges.push_back(j);
          }
rusty1s's avatar
rusty1s committed
226
227
228
        }
      }
    }
rusty1s's avatar
update  
rusty1s committed
229
    if (rows.size() > 0) {
rusty1s's avatar
rusty1s committed
230
231
232
      out_row_dict.insert(rel_type, from_vector<int64_t>(rows));
      out_col_dict.insert(rel_type, from_vector<int64_t>(cols));
      out_edge_dict.insert(rel_type, from_vector<int64_t>(edges));
rusty1s's avatar
update  
rusty1s committed
233
    }
rusty1s's avatar
rusty1s committed
234
235
  }

rusty1s's avatar
rusty1s committed
236
237
238
239
240
241
  // Generate tensor-valued output node dictionary (line 20):
  for (const auto &kv : nodes_dict) {
    const auto &node_type = kv.first;
    const auto &nodes = kv.second;
    if (!nodes.empty())
      out_node_dict.insert(node_type, from_vector<int64_t>(nodes));
rusty1s's avatar
rusty1s committed
242
243
  }

rusty1s's avatar
rusty1s committed
244
  return make_tuple(out_node_dict, out_row_dict, out_col_dict, out_edge_dict);
rusty1s's avatar
rusty1s committed
245
}