hgt_sample_cpu.cpp 8.91 KB
Newer Older
rusty1s's avatar
rusty1s committed
1
2
#include "hgt_sample_cpu.h"

rusty1s's avatar
rusty1s committed
3
#include "utils.h"
rusty1s's avatar
rusty1s committed
4

rusty1s's avatar
rusty1s committed
5
6
7
8
#ifdef _WIN32
#include <process.h>
#endif

rusty1s's avatar
rusty1s committed
9
#define MAX_NEIGHBORS 50
rusty1s's avatar
update  
rusty1s committed
10

rusty1s's avatar
typos  
rusty1s committed
11
12
using namespace std;

rusty1s's avatar
rusty1s committed
13
edge_t split(const rel_t &rel_type) {
rusty1s's avatar
rusty1s committed
14
15
  vector<string> result(3);
  int start = 0, end;
rusty1s's avatar
rusty1s committed
16
  for (int i = 0; i < 3; i++) {
rusty1s's avatar
rusty1s committed
17
    end = rel_type.find("__", start);
rusty1s's avatar
rusty1s committed
18
19
20
    result[i] = rel_type.substr(start, end - start);
    start = end + 2;
  }
rusty1s's avatar
rusty1s committed
21
  return make_tuple(result[0], result[1], result[2]);
rusty1s's avatar
rusty1s committed
22
23
}

rusty1s's avatar
rusty1s committed
24
25
void update_budget_(
    unordered_map<node_t, unordered_map<int64_t, float>> *budget_dict,
rusty1s's avatar
typos  
rusty1s committed
26
    const node_t &node_type, const vector<int64_t> &samples,
rusty1s's avatar
rusty1s committed
27
28
29
    const unordered_map<node_t, unordered_map<int64_t, int64_t>>
        &to_local_node_dict,
    const unordered_map<rel_t, edge_t> &to_edge_type,
rusty1s's avatar
update  
rusty1s committed
30
31
    const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
    const c10::Dict<rel_t, torch::Tensor> &row_dict) {
rusty1s's avatar
rusty1s committed
32

rusty1s's avatar
rusty1s committed
33
34
35
  if (samples.empty())
    return;

rusty1s's avatar
update  
rusty1s committed
36
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
37
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
38
39
40
    const auto &edge_type = to_edge_type.at(rel_type);
    const auto &src_node_type = get<0>(edge_type);
    const auto &dst_node_type = get<2>(edge_type);
rusty1s's avatar
rusty1s committed
41
42
43
44

    if (node_type != dst_node_type)
      continue;

rusty1s's avatar
rusty1s committed
45
    const auto &to_local_src_node = to_local_node_dict.at(src_node_type);
rusty1s's avatar
update  
rusty1s committed
46
47
    const auto *colptr_data = kv.value().data_ptr<int64_t>();
    const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
48
    auto &src_budget = budget_dict->at(src_node_type);
rusty1s's avatar
rusty1s committed
49

rusty1s's avatar
rusty1s committed
50
51
52
    for (const auto &w : samples) {
      const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
      if (col_end - col_start > MAX_NEIGHBORS) {
rusty1s's avatar
update  
rusty1s committed
53
        // There might be same neighbors with large neighborhood sizes.
rusty1s's avatar
rusty1s committed
54
55
56
57
58
59
        // In order to prevent that we fill our budget with many values of low
        // probability, we instead sample a fixed amount without replacement:
        auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
        auto *indices_data = indices.data_ptr<int64_t>();
        for (int64_t i = 0; i < indices.numel(); i++) {
          const auto &v = row_data[col_start + indices_data[i]];
rusty1s's avatar
update  
rusty1s committed
60
          // Only add the neighbor in case we have not yet seen it before:
rusty1s's avatar
rusty1s committed
61
62
          if (to_local_src_node.find(v) == to_local_src_node.end())
            src_budget[v] += 1.f / float(MAX_NEIGHBORS);
rusty1s's avatar
update  
rusty1s committed
63
        }
rusty1s's avatar
rusty1s committed
64
65

      } else if (col_end != col_start) {
rusty1s's avatar
update  
rusty1s committed
66
        const auto inv_deg = 1.f / float(col_end - col_start);
rusty1s's avatar
rusty1s committed
67
68
        for (int64_t i = col_start; i < col_end; i++) {
          const auto &v = row_data[i];
rusty1s's avatar
update  
rusty1s committed
69
          // Only add the neighbor in case we have not yet seen it before:
rusty1s's avatar
rusty1s committed
70
71
          if (to_local_src_node.find(v) == to_local_src_node.end())
            src_budget[v] += inv_deg;
rusty1s's avatar
rusty1s committed
72
73
74
75
76
77
        }
      }
    }
  }
}

rusty1s's avatar
rusty1s committed
78
79
80
81
82
83
vector<int64_t> sample_from(const unordered_map<int64_t, float> &budget,
                            const int64_t num_samples) {
  vector<int64_t> indices;
  vector<float> weights;
  indices.reserve(budget.size());
  weights.reserve(budget.size());
rusty1s's avatar
update  
rusty1s committed
84
  for (const auto &kv : budget) {
rusty1s's avatar
rusty1s committed
85
86
    indices.push_back(kv.first);
    weights.push_back(kv.second * kv.second);
rusty1s's avatar
update  
rusty1s committed
87
  }
rusty1s's avatar
rusty1s committed
88

rusty1s's avatar
rusty1s committed
89
90
91
  const auto weight = from_vector(weights, true);
  const auto sample = choice(budget.size(), num_samples, false, weight);
  const auto *sample_data = sample.data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
92

rusty1s's avatar
rusty1s committed
93
94
95
  vector<int64_t> out(sample.numel());
  for (int64_t i = 0; i < sample.numel(); i++) {
    out[i] = indices[sample_data[i]];
rusty1s's avatar
rusty1s committed
96
  }
rusty1s's avatar
rusty1s committed
97
  return out;
rusty1s's avatar
rusty1s committed
98
99
}

rusty1s's avatar
rusty1s committed
100
101
tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
      c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
rusty1s's avatar
update  
rusty1s committed
102
103
hgt_sample_cpu(const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
               const c10::Dict<rel_t, torch::Tensor> &row_dict,
rusty1s's avatar
rusty1s committed
104
               const c10::Dict<node_t, torch::Tensor> &input_node_dict,
rusty1s's avatar
rusty1s committed
105
106
               const c10::Dict<node_t, vector<int64_t>> &num_samples_dict,
               const int64_t num_hops) {
rusty1s's avatar
rusty1s committed
107

rusty1s's avatar
rusty1s committed
108
  // Create a mapping to convert single string relations to edge type triplets:
rusty1s's avatar
rusty1s committed
109
  unordered_map<rel_t, edge_t> to_edge_type;
rusty1s's avatar
update  
rusty1s committed
110
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
111
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
112
    to_edge_type[rel_type] = split(rel_type);
rusty1s's avatar
rusty1s committed
113
114
  }

rusty1s's avatar
rusty1s committed
115
116
117
118
  // Initialize some necessary data structures for the sampling process:
  unordered_map<node_t, vector<int64_t>> nodes_dict;
  unordered_map<node_t, unordered_map<int64_t, int64_t>> to_local_node_dict;
  unordered_map<node_t, unordered_map<int64_t, float>> budget_dict;
rusty1s's avatar
fix  
rusty1s committed
119
120
  for (const auto &kv : num_samples_dict) {
    const auto &node_type = kv.key();
rusty1s's avatar
rusty1s committed
121
122
    nodes_dict[node_type];
    to_local_node_dict[node_type];
rusty1s's avatar
fix  
rusty1s committed
123
124
    budget_dict[node_type];
  }
rusty1s's avatar
rusty1s committed
125

rusty1s's avatar
rusty1s committed
126
  // Add the input nodes to the sampled output nodes (line 1):
rusty1s's avatar
rusty1s committed
127
128
129
130
131
  for (const auto &kv : input_node_dict) {
    const auto &node_type = kv.key();
    const auto &input_node = kv.value();
    const auto *input_node_data = input_node.data_ptr<int64_t>();

rusty1s's avatar
rusty1s committed
132
133
    auto &nodes = nodes_dict.at(node_type);
    auto &to_local_node = to_local_node_dict.at(node_type);
rusty1s's avatar
rusty1s committed
134
    for (int64_t i = 0; i < input_node.numel(); i++) {
rusty1s's avatar
rusty1s committed
135
136
137
      const auto &v = input_node_data[i];
      nodes.push_back(v);
      to_local_node[v] = i;
rusty1s's avatar
rusty1s committed
138
    }
rusty1s's avatar
update  
rusty1s committed
139
  }
rusty1s's avatar
rusty1s committed
140
141
142
143
144
145
146

  // Update the budget based on the initial input set (line 3-5):
  for (const auto &kv : nodes_dict) {
    const auto &node_type = kv.first;
    const auto &last_samples = kv.second;
    update_budget_(&budget_dict, node_type, last_samples, to_local_node_dict,
                   to_edge_type, colptr_dict, row_dict);
rusty1s's avatar
rusty1s committed
147
148
149
  }

  for (int64_t ell = 0; ell < num_hops; ell++) {
rusty1s's avatar
rusty1s committed
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
    unordered_map<node_t, vector<int64_t>> samples_dict;
    for (auto &kv : budget_dict) {
      const auto &node_type = kv.first;
      auto &budget = kv.second;
      const auto num_samples = num_samples_dict.at(node_type)[ell];

      // Sample `num_samples` nodes, according to the budget (line 9-11):
      const auto samples = sample_from(budget, num_samples);
      samples_dict[node_type] = samples;

      // Add samples to the sampled output nodes, and erase them from the budget
      // (line 13/15):
      auto &nodes = nodes_dict.at(node_type);
      auto &to_local_node = to_local_node_dict.at(node_type);
      for (const auto &v : samples) {
        to_local_node[v] = nodes.size();
        nodes.push_back(v);
        budget.erase(v);
      }
rusty1s's avatar
update  
rusty1s committed
169
    }
rusty1s's avatar
rusty1s committed
170

rusty1s's avatar
rusty1s committed
171
172
173
174
175
176
177
178
    if (ell < num_hops - 1) {
      // Add neighbors of newly sampled nodes to the budget (line 14):
      // Note that we do not need to update the budget in the last iteration.
      for (const auto &kv : samples_dict) {
        const auto &node_type = kv.first;
        const auto &last_samples = kv.second;
        update_budget_(&budget_dict, node_type, last_samples,
                       to_local_node_dict, to_edge_type, colptr_dict, row_dict);
rusty1s's avatar
update  
rusty1s committed
179
      }
rusty1s's avatar
rusty1s committed
180
181
182
    }
  }

rusty1s's avatar
rusty1s committed
183
184
185
186
  c10::Dict<node_t, torch::Tensor> out_node_dict;
  c10::Dict<rel_t, torch::Tensor> out_row_dict;
  c10::Dict<rel_t, torch::Tensor> out_col_dict;
  c10::Dict<rel_t, torch::Tensor> out_edge_dict;
rusty1s's avatar
update  
rusty1s committed
187

rusty1s's avatar
rusty1s committed
188
  // Reconstruct the sampled adjacency matrix among the sampled nodes (line 19):
rusty1s's avatar
update  
rusty1s committed
189
  for (const auto &kv : colptr_dict) {
rusty1s's avatar
rusty1s committed
190
    const auto &rel_type = kv.key();
rusty1s's avatar
rusty1s committed
191
192
193
    const auto &edge_type = to_edge_type.at(rel_type);
    const auto &src_node_type = get<0>(edge_type);
    const auto &dst_node_type = get<2>(edge_type);
rusty1s's avatar
rusty1s committed
194

rusty1s's avatar
update  
rusty1s committed
195
196
    const auto *colptr_data = kv.value().data_ptr<int64_t>();
    const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
rusty1s's avatar
rusty1s committed
197

rusty1s's avatar
rusty1s committed
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
    const auto &dst_nodes = nodes_dict.at(dst_node_type);
    const auto &to_local_src_node = to_local_node_dict.at(src_node_type);

    vector<int64_t> rows, cols, edges;
    for (int64_t i = 0; i < (int64_t)dst_nodes.size(); i++) {
      const auto &w = dst_nodes[i];
      const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
      if (col_end - col_start > MAX_NEIGHBORS) {
        auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
        auto *indices_data = indices.data_ptr<int64_t>();
        for (int64_t j = 0; j < indices.numel(); j++) {
          const auto &v = row_data[col_start + indices_data[j]];
          if (to_local_src_node.find(v) != to_local_src_node.end()) {
            rows.push_back(to_local_src_node.at(v));
            cols.push_back(i);
            edges.push_back(col_start + j);
          }
        }
      } else {
        for (int64_t j = col_start; j < col_end; j++) {
          const auto &v = row_data[j];
          if (to_local_src_node.find(v) != to_local_src_node.end()) {
            rows.push_back(to_local_src_node.at(v));
            cols.push_back(i);
            edges.push_back(j);
          }
rusty1s's avatar
rusty1s committed
224
225
226
        }
      }
    }
rusty1s's avatar
update  
rusty1s committed
227
    if (rows.size() > 0) {
rusty1s's avatar
rusty1s committed
228
229
230
      out_row_dict.insert(rel_type, from_vector<int64_t>(rows));
      out_col_dict.insert(rel_type, from_vector<int64_t>(cols));
      out_edge_dict.insert(rel_type, from_vector<int64_t>(edges));
rusty1s's avatar
update  
rusty1s committed
231
    }
rusty1s's avatar
rusty1s committed
232
233
  }

rusty1s's avatar
rusty1s committed
234
235
236
237
238
239
  // Generate tensor-valued output node dictionary (line 20):
  for (const auto &kv : nodes_dict) {
    const auto &node_type = kv.first;
    const auto &nodes = kv.second;
    if (!nodes.empty())
      out_node_dict.insert(node_type, from_vector<int64_t>(nodes));
rusty1s's avatar
rusty1s committed
240
241
  }

rusty1s's avatar
rusty1s committed
242
  return make_tuple(out_node_dict, out_row_dict, out_col_dict, out_edge_dict);
rusty1s's avatar
rusty1s committed
243
}