Commit 97f2f4e9 authored by quyuanhao123

Initial commit
#include "rw_cpu.h"
#include "utils.h"
torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor start, int64_t walk_length) {
CHECK_CPU(rowptr);
CHECK_CPU(col);
CHECK_CPU(start);
CHECK_INPUT(rowptr.dim() == 1);
CHECK_INPUT(col.dim() == 1);
CHECK_INPUT(start.dim() == 1);
auto rand = torch::rand({start.size(0), walk_length},
start.options().dtype(torch::kFloat));
auto L = walk_length + 1;
auto out = torch::full({start.size(0), L}, -1, start.options());
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto start_data = start.data_ptr<int64_t>();
auto rand_data = rand.data_ptr<float>();
auto out_data = out.data_ptr<int64_t>();
for (auto n = 0; n < start.size(0); n++) {
auto cur = start_data[n];
out_data[n * L] = cur;
int64_t row_start, row_end;
for (auto l = 0; l < walk_length; l++) {
row_start = rowptr_data[cur];
row_end = rowptr_data[cur + 1];
cur = col_data[row_start + int64_t(rand_data[n * walk_length + l] *
(row_end - row_start))];
out_data[n * L + l + 1] = cur;
}
}
return out;
}
#pragma once
#include "../extensions.h"
torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor start, int64_t walk_length);
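// A minimal usage sketch for random_walk_cpu above (the helper name and the
// tensor values are illustrative, not part of the library API): a directed
// 3-cycle 0 -> 1 -> 2 -> 0 stored in CSR form, walked for 4 steps from every
// node. The result has shape [start.size(0), walk_length + 1] and its first
// column equals `start`.
static torch::Tensor random_walk_example() {
  const auto opts = torch::TensorOptions().dtype(torch::kLong);
  const auto rowptr = torch::tensor({0, 1, 2, 3}, opts); // one neighbor per node
  const auto col = torch::tensor({1, 2, 0}, opts);
  const auto start = torch::arange(3, opts);
  return random_walk_cpu(rowptr, col, start, /*walk_length=*/4);
}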
#include "saint_cpu.h"
#include "utils.h"
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
subgraph_cpu(torch::Tensor idx, torch::Tensor rowptr, torch::Tensor row,
torch::Tensor col) {
CHECK_CPU(idx);
CHECK_CPU(rowptr);
CHECK_CPU(col);
CHECK_INPUT(idx.dim() == 1);
CHECK_INPUT(rowptr.dim() == 1);
CHECK_INPUT(col.dim() == 1);
auto assoc = torch::full({rowptr.size(0) - 1}, -1, idx.options());
assoc.index_copy_(0, idx, torch::arange(idx.size(0), idx.options()));
auto idx_data = idx.data_ptr<int64_t>();
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto assoc_data = assoc.data_ptr<int64_t>();
std::vector<int64_t> rows, cols, indices;
int64_t v, w, w_new, row_start, row_end;
for (int64_t v_new = 0; v_new < idx.size(0); v_new++) {
v = idx_data[v_new];
row_start = rowptr_data[v];
row_end = rowptr_data[v + 1];
for (int64_t j = row_start; j < row_end; j++) {
w = col_data[j];
w_new = assoc_data[w];
if (w_new > -1) {
rows.push_back(v_new);
cols.push_back(w_new);
indices.push_back(j);
}
}
}
int64_t length = rows.size();
row = torch::from_blob(rows.data(), {length}, row.options()).clone();
col = torch::from_blob(cols.data(), {length}, row.options()).clone();
idx = torch::from_blob(indices.data(), {length}, row.options()).clone();
return std::make_tuple(row, col, idx);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
subgraph_cpu(torch::Tensor idx, torch::Tensor rowptr, torch::Tensor row,
torch::Tensor col);
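// A minimal usage sketch for subgraph_cpu above (helper name and values are
// illustrative): extract the subgraph induced by nodes {0, 2} of a 3-node
// graph given in CSR (`rowptr`, `col`) together with the matching COO `row`
// vector. For this input the result is row = [0, 1], col = [1, 0],
// e_id = [1, 3], i.e. edges relabeled to the induced node ordering plus the
// positions of those edges in the original `col`.
static std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
subgraph_example() {
  const auto opts = torch::TensorOptions().dtype(torch::kLong);
  const auto rowptr = torch::tensor({0, 2, 3, 4}, opts);
  const auto row = torch::tensor({0, 0, 1, 2}, opts);
  const auto col = torch::tensor({1, 2, 2, 0}, opts);
  const auto idx = torch::tensor({0, 2}, opts); // nodes to keep
  return subgraph_cpu(idx, rowptr, row, col);
}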
#include "sample_cpu.h"
#include "utils.h"
#ifdef _WIN32
#include <process.h> // getpid() on Windows
#else
#include <unistd.h> // getpid() on POSIX
#endif
// Returns `rowptr`, `col`, `n_id`, `e_id`
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
sample_adj_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor idx,
int64_t num_neighbors, bool replace) {
CHECK_CPU(rowptr);
CHECK_CPU(col);
CHECK_CPU(idx);
CHECK_INPUT(idx.dim() == 1);
srand(time(NULL) + 1000 * getpid()); // Initialize random seed.
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto idx_data = idx.data_ptr<int64_t>();
auto out_rowptr = torch::empty(idx.numel() + 1, rowptr.options());
auto out_rowptr_data = out_rowptr.data_ptr<int64_t>();
out_rowptr_data[0] = 0;
std::vector<std::vector<std::tuple<int64_t, int64_t>>> cols; // col, e_id
std::vector<int64_t> n_ids;
std::unordered_map<int64_t, int64_t> n_id_map;
int64_t i;
for (int64_t n = 0; n < idx.numel(); n++) {
i = idx_data[n];
cols.push_back(std::vector<std::tuple<int64_t, int64_t>>());
n_id_map[i] = n;
n_ids.push_back(i);
}
int64_t n, c, e, row_start, row_end, row_count;
if (num_neighbors < 0) { // No sampling ======================================
for (int64_t i = 0; i < idx.numel(); i++) {
n = idx_data[i];
row_start = rowptr_data[n], row_end = rowptr_data[n + 1];
row_count = row_end - row_start;
for (int64_t j = 0; j < row_count; j++) {
e = row_start + j;
c = col_data[e];
if (n_id_map.count(c) == 0) {
n_id_map[c] = n_ids.size();
n_ids.push_back(c);
}
cols[i].push_back(std::make_tuple(n_id_map[c], e));
}
out_rowptr_data[i + 1] = out_rowptr_data[i] + cols[i].size();
}
}
else if (replace) { // Sample with replacement ===============================
for (int64_t i = 0; i < idx.numel(); i++) {
n = idx_data[i];
row_start = rowptr_data[n], row_end = rowptr_data[n + 1];
row_count = row_end - row_start;
if (row_count > 0) {
for (int64_t j = 0; j < num_neighbors; j++) {
e = row_start + rand() % row_count;
c = col_data[e];
if (n_id_map.count(c) == 0) {
n_id_map[c] = n_ids.size();
n_ids.push_back(c);
}
cols[i].push_back(std::make_tuple(n_id_map[c], e));
}
}
out_rowptr_data[i + 1] = out_rowptr_data[i] + cols[i].size();
}
} else { // Sample without replacement via Robert Floyd algorithm ============
for (int64_t i = 0; i < idx.numel(); i++) {
n = idx_data[i];
row_start = rowptr_data[n], row_end = rowptr_data[n + 1];
row_count = row_end - row_start;
std::unordered_set<int64_t> perm;
if (row_count <= num_neighbors) {
for (int64_t j = 0; j < row_count; j++)
perm.insert(j);
} else { // See: https://www.nowherenearithaca.com/2013/05/
// robert-floyds-tiny-and-beautiful.html
for (int64_t j = row_count - num_neighbors; j < row_count; j++) {
if (!perm.insert(rand() % j).second)
perm.insert(j);
}
}
for (const int64_t &p : perm) {
e = row_start + p;
c = col_data[e];
if (n_id_map.count(c) == 0) {
n_id_map[c] = n_ids.size();
n_ids.push_back(c);
}
cols[i].push_back(std::make_tuple(n_id_map[c], e));
}
out_rowptr_data[i + 1] = out_rowptr_data[i] + cols[i].size();
}
}
int64_t N = n_ids.size();
auto out_n_id = torch::from_blob(n_ids.data(), {N}, col.options()).clone();
int64_t E = out_rowptr_data[idx.numel()];
auto out_col = torch::empty(E, col.options());
auto out_col_data = out_col.data_ptr<int64_t>();
auto out_e_id = torch::empty(E, col.options());
auto out_e_id_data = out_e_id.data_ptr<int64_t>();
i = 0;
for (std::vector<std::tuple<int64_t, int64_t>> &col_vec : cols) {
std::sort(col_vec.begin(), col_vec.end(),
[](const std::tuple<int64_t, int64_t> &a,
const std::tuple<int64_t, int64_t> &b) -> bool {
return std::get<0>(a) < std::get<0>(b);
});
for (const std::tuple<int64_t, int64_t> &value : col_vec) {
out_col_data[i] = std::get<0>(value);
out_e_id_data[i] = std::get<1>(value);
i += 1;
}
}
return std::make_tuple(out_rowptr, out_col, out_n_id, out_e_id);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
sample_adj_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor idx,
int64_t num_neighbors, bool replace);
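// A minimal usage sketch for sample_adj_cpu above (helper name and seed values
// are illustrative): sample up to two neighbors per seed node without
// replacement (the Robert Floyd path in the implementation). The returned CSR
// pair (`rowptr`, `col`) indexes into `n_id`, which starts with the seed nodes
// themselves, and `e_id` maps every sampled edge back to its position in the
// input `col`.
static std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
sample_adj_example(const torch::Tensor &rowptr, const torch::Tensor &col) {
  const auto idx = torch::tensor({0, 2}, rowptr.options()); // seed nodes
  return sample_adj_cpu(rowptr, col, idx, /*num_neighbors=*/2,
                        /*replace=*/false);
}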
#include "spmm_cpu.h"
#include <ATen/Parallel.h>
#include "reducer.h"
#include "utils.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
spmm_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value, torch::Tensor mat,
std::string reduce) {
CHECK_CPU(rowptr);
CHECK_CPU(col);
if (optional_value.has_value())
CHECK_CPU(optional_value.value());
CHECK_CPU(mat);
CHECK_INPUT(rowptr.dim() == 1);
CHECK_INPUT(col.dim() == 1);
if (optional_value.has_value()) {
CHECK_INPUT(optional_value.value().dim() == 1);
CHECK_INPUT(optional_value.value().size(0) == col.size(0));
}
CHECK_INPUT(mat.dim() >= 2);
mat = mat.contiguous();
auto sizes = mat.sizes().vec();
sizes[mat.dim() - 2] = rowptr.numel() - 1;
auto out = torch::empty(sizes, mat.options());
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full_like(out, col.numel(), rowptr.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
}
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto M = rowptr.numel() - 1;
auto N = mat.size(-2);
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, mat.scalar_type(), "_", [&] {
scalar_t *value_data = nullptr;
auto mat_data = mat.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
AT_DISPATCH_HAS_VALUE(optional_value, [&] {
if (HAS_VALUE) {
value_data = optional_value.value().data_ptr<scalar_t>();
}
int64_t grain_size = at::internal::GRAIN_SIZE /
(K * std::max(col.numel() / M, (int64_t)1));
at::parallel_for(0, B * M, grain_size, [&](int64_t begin, int64_t end) {
scalar_t val;
std::vector<scalar_t> vals(K);
int64_t row_start, row_end, b, m, c;
std::vector<int64_t> args(K);
for (auto i = begin; i < end; i++) {
b = i / M, m = i % M;
row_start = rowptr_data[m], row_end = rowptr_data[m + 1];
for (auto k = 0; k < K; k++)
vals[k] = Reducer<scalar_t, REDUCE>::init();
auto offset = b * N * K;
for (auto e = row_start; e < row_end; e++) {
c = col_data[e];
if (HAS_VALUE)
val = value_data[e];
for (auto k = 0; k < K; k++) {
if (HAS_VALUE)
Reducer<scalar_t, REDUCE>::update(
&vals[k], val * mat_data[offset + c * K + k], &args[k],
e);
else
Reducer<scalar_t, REDUCE>::update(
&vals[k], mat_data[offset + c * K + k], &args[k], e);
}
}
offset = b * M * K + m * K;
for (auto k = 0; k < K; k++)
Reducer<scalar_t, REDUCE>::write(out_data + offset + k, vals[k],
arg_out_data + offset + k,
args[k], row_end - row_start);
}
});
});
});
});
return std::make_tuple(out, arg_out);
}
torch::Tensor spmm_value_bw_cpu(torch::Tensor row, torch::Tensor rowptr,
torch::Tensor col, torch::Tensor mat,
torch::Tensor grad, std::string reduce) {
CHECK_CPU(row);
CHECK_CPU(rowptr);
CHECK_CPU(col);
CHECK_CPU(mat);
CHECK_CPU(grad);
mat = mat.contiguous();
grad = grad.contiguous();
auto M = grad.size(-2);
auto N = mat.size(-2);
auto E = row.numel();
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
auto out = torch::zeros(row.numel(), grad.options());
auto row_data = row.data_ptr<int64_t>();
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, mat.scalar_type(), "_", [&] {
auto mat_data = mat.data_ptr<scalar_t>();
auto grad_data = grad.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
scalar_t val;
int64_t row, col;
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
for (int b = 0; b < B; b++) {
for (int e = 0; e < E; e++) {
row = row_data[e], col = col_data[e], val = (scalar_t)0;
for (int k = 0; k < K; k++) {
val += mat_data[b * N * K + col * K + k] *
grad_data[b * M * K + row * K + k];
}
if (REDUCE == MEAN) {
int row_start = rowptr_data[row], row_end = rowptr_data[row + 1];
val /= (scalar_t)std::max(row_end - row_start, 1);
}
out_data[e] += val;
}
}
});
});
return out;
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
spmm_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value, torch::Tensor mat,
std::string reduce);
torch::Tensor spmm_value_bw_cpu(torch::Tensor row, torch::Tensor rowptr,
torch::Tensor col, torch::Tensor mat,
torch::Tensor grad, std::string reduce);
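// A minimal usage sketch for spmm_cpu above (helper name and values are
// illustrative): multiply a 2 x 3 CSR matrix with two nonzeros per row by a
// dense 3 x 2 matrix using the "sum" reduction. For "min"/"max" the second
// tuple element additionally holds argmin/argmax edge indices; for "sum" it
// stays empty.
static torch::Tensor spmm_example() {
  const auto index_opts = torch::TensorOptions().dtype(torch::kLong);
  const auto rowptr = torch::tensor({0, 2, 4}, index_opts);
  const auto col = torch::tensor({0, 1, 1, 2}, index_opts);
  const auto value = torch::tensor({1.0f, 2.0f, 3.0f, 4.0f}); // one per edge
  const auto mat = torch::rand({3, 2});
  return std::get<0>(spmm_cpu(rowptr, col, value, mat, "sum"));
}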
#include "spspmm_cpu.h"
#include "utils.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cpu(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce) {
CHECK_CPU(rowptrA);
CHECK_CPU(colA);
if (optional_valueA.has_value())
CHECK_CPU(optional_valueA.value());
CHECK_CPU(rowptrB);
CHECK_CPU(colB);
if (optional_valueB.has_value())
CHECK_CPU(optional_valueB.value());
CHECK_INPUT(rowptrA.dim() == 1);
CHECK_INPUT(colA.dim() == 1);
if (optional_valueA.has_value()) {
CHECK_INPUT(optional_valueA.value().dim() == 1);
CHECK_INPUT(optional_valueA.value().size(0) == colA.size(0));
}
CHECK_INPUT(rowptrB.dim() == 1);
CHECK_INPUT(colB.dim() == 1);
if (optional_valueB.has_value()) {
CHECK_INPUT(optional_valueB.value().dim() == 1);
CHECK_INPUT(optional_valueB.value().size(0) == colB.size(0));
}
if (!optional_valueA.has_value() && optional_valueB.has_value())
optional_valueA =
torch::ones(colA.numel(), optional_valueB.value().options());
if (!optional_valueB.has_value() && optional_valueA.has_value())
optional_valueB =
torch::ones(colB.numel(), optional_valueA.value().options());
auto scalar_type = torch::ScalarType::Float;
if (optional_valueA.has_value())
scalar_type = optional_valueA.value().scalar_type();
auto rowptrA_data = rowptrA.data_ptr<int64_t>();
auto colA_data = colA.data_ptr<int64_t>();
auto rowptrB_data = rowptrB.data_ptr<int64_t>();
auto colB_data = colB.data_ptr<int64_t>();
auto rowptrC = torch::empty_like(rowptrA);
auto rowptrC_data = rowptrC.data_ptr<int64_t>();
rowptrC_data[0] = 0;
torch::Tensor colC;
torch::optional<torch::Tensor> optional_valueC = torch::nullopt;
AT_DISPATCH_ALL_TYPES(scalar_type, "spspmm", [&] {
AT_DISPATCH_HAS_VALUE(optional_valueA, [&] {
scalar_t *valA_data = nullptr, *valB_data = nullptr;
if (HAS_VALUE) {
valA_data = optional_valueA.value().data_ptr<scalar_t>();
valB_data = optional_valueB.value().data_ptr<scalar_t>();
}
int64_t nnz = 0, cA, cB;
std::vector<scalar_t> tmp_vals(K, 0);
std::vector<int64_t> cols;
std::vector<scalar_t> vals;
for (auto rA = 0; rA < rowptrA.numel() - 1; rA++) {
for (auto eA = rowptrA_data[rA]; eA < rowptrA_data[rA + 1]; eA++) {
cA = colA_data[eA];
for (auto eB = rowptrB_data[cA]; eB < rowptrB_data[cA + 1]; eB++) {
cB = colB_data[eB];
if (HAS_VALUE)
tmp_vals[cB] += valA_data[eA] * valB_data[eB];
else
tmp_vals[cB]++;
}
}
for (auto k = 0; k < K; k++) {
if (tmp_vals[k] != 0) {
cols.push_back(k);
if (HAS_VALUE)
vals.push_back(tmp_vals[k]);
nnz++;
}
tmp_vals[k] = (scalar_t)0;
}
rowptrC_data[rA + 1] = nnz;
}
colC = torch::from_blob(cols.data(), {nnz}, colA.options()).clone();
if (HAS_VALUE) {
optional_valueC = torch::from_blob(vals.data(), {nnz},
optional_valueA.value().options());
optional_valueC = optional_valueC.value().clone();
}
});
});
return std::make_tuple(rowptrC, colC, optional_valueC);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cpu(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce);
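// A minimal usage sketch for spspmm_cpu above (helper name and values are
// illustrative): multiply a 2 x 3 CSR matrix A by a 3 x 2 CSR matrix B, where
// K is the number of columns of B (and of the result). With both value
// tensors absent the body above only tracks structural nonzeros and returns
// no value tensor; note that it accumulates by summation regardless of the
// `reduce` string.
static std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_example() {
  const auto opts = torch::TensorOptions().dtype(torch::kLong);
  const auto rowptrA = torch::tensor({0, 2, 3}, opts);
  const auto colA = torch::tensor({0, 2, 1}, opts);
  const auto rowptrB = torch::tensor({0, 1, 2, 2}, opts);
  const auto colB = torch::tensor({1, 0}, opts);
  return spspmm_cpu(rowptrA, colA, torch::nullopt, rowptrB, colB,
                    torch::nullopt, /*K=*/2, "sum");
}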
#pragma once
#include "../extensions.h"
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
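// Runs the given lambda with a local `HAS_VALUE` flag reflecting whether the
// optional tensor is present; spmm_cpu and spspmm_cpu use it to branch on the
// existence of edge values inside their dispatch lambdas.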
#define AT_DISPATCH_HAS_VALUE(optional_value, ...) \
[&] { \
if (optional_value.has_value()) { \
const bool HAS_VALUE = true; \
return __VA_ARGS__(); \
} else { \
const bool HAS_VALUE = false; \
return __VA_ARGS__(); \
} \
}()
template <typename scalar_t>
inline torch::Tensor from_vector(const std::vector<scalar_t> &vec,
bool inplace = false) {
const auto size = (int64_t)vec.size();
const auto out = torch::from_blob((scalar_t *)vec.data(), {size},
c10::CppTypeToScalarType<scalar_t>::value);
return inplace ? out : out.clone();
}
template <typename key_t, typename scalar_t>
inline c10::Dict<key_t, torch::Tensor>
from_vector(const std::unordered_map<key_t, std::vector<scalar_t>> &vec_dict,
bool inplace = false) {
c10::Dict<key_t, torch::Tensor> out_dict;
for (const auto &kv : vec_dict)
out_dict.insert(kv.first, from_vector<scalar_t>(kv.second, inplace));
return out_dict;
}
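// A minimal usage sketch for the from_vector helpers above (helper name and
// values are illustrative): copy a std::vector into a CPU tensor; with
// inplace = true the buffer is wrapped without a copy, which is only safe if
// the vector outlives the tensor.
static torch::Tensor from_vector_example() {
  std::vector<int64_t> degrees = {3, 0, 1};
  return from_vector<int64_t>(degrees); // clones, so `degrees` may go away
}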
inline torch::Tensor
choice(int64_t population, int64_t num_samples, bool replace = false,
torch::optional<torch::Tensor> weight = torch::nullopt) {
if (population == 0 || num_samples == 0)
return torch::empty({0}, at::kLong);
if (!replace && num_samples >= population)
return torch::arange(population, at::kLong);
if (weight.has_value())
return torch::multinomial(weight.value(), num_samples, replace);
if (replace) {
const auto out = torch::empty(num_samples, at::kLong);
auto *out_data = out.data_ptr<int64_t>();
for (int64_t i = 0; i < num_samples; i++) {
out_data[i] = rand() % population;
}
return out;
} else {
// Sample without replacement via Robert Floyd algorithm:
// https://www.nowherenearithaca.com/2013/05/
// robert-floyds-tiny-and-beautiful.html
const auto out = torch::empty(num_samples, at::kLong);
auto *out_data = out.data_ptr<int64_t>();
std::unordered_set<int64_t> samples;
for (int64_t i = population - num_samples; i < population; i++) {
int64_t sample = rand() % i;
if (!samples.insert(sample).second) {
sample = i;
samples.insert(sample);
}
out_data[i - population + num_samples] = sample;
}
return out;
}
}
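// A minimal usage sketch for choice() above (helper name and numbers are
// illustrative): draw three distinct indices out of ten, uniformly. Passing a
// `weight` tensor instead delegates to torch::multinomial, and replace = true
// falls back to plain rand() % population draws.
static torch::Tensor choice_example() {
  return choice(/*population=*/10, /*num_samples=*/3, /*replace=*/false);
}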
template <bool replace>
inline void
uniform_choice(const int64_t population, const int64_t num_samples,
const int64_t *idx_data, std::vector<int64_t> *samples,
std::unordered_map<int64_t, int64_t> *to_local_node) {
if (population == 0 || num_samples == 0)
return;
if (replace) {
for (int64_t i = 0; i < num_samples; i++) {
const int64_t &v = idx_data[rand() % population];
if (to_local_node->insert({v, samples->size()}).second)
samples->push_back(v);
}
} else if (num_samples >= population) {
for (int64_t i = 0; i < population; i++) {
const int64_t &v = idx_data[i];
if (to_local_node->insert({v, samples->size()}).second)
samples->push_back(v);
}
} else {
std::unordered_set<int64_t> indices;
for (int64_t i = population - num_samples; i < population; i++) {
int64_t j = rand() % i;
if (!indices.insert(j).second) {
j = i;
indices.insert(j);
}
const int64_t &v = idx_data[j];
if (to_local_node->insert({v, samples->size()}).second)
samples->push_back(v);
}
}
}
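// uniform_choice mirrors choice() but samples node ids directly from
// `idx_data`, appending each newly seen node to `samples` and recording its
// local index in `to_local_node`, so repeated draws of the same node are
// deduplicated.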
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/diag_cpu.h"
#ifdef WITH_HIP
#include "hip/diag_hip.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
PyMODINIT_FUNC PyInit__diag_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__diag_cpu(void) { return NULL; }
#endif
#endif
#endif
SPARSE_API torch::Tensor non_diag_mask(torch::Tensor row, torch::Tensor col, int64_t M,
int64_t N, int64_t k) {
if (row.device().is_cuda()) {
#ifdef WITH_HIP
return non_diag_mask_cuda(row, col, M, N, k);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return non_diag_mask_cpu(row, col, M, N, k);
}
}
static auto registry = torch::RegisterOperators().op(
"torch_sparse::non_diag_mask", &non_diag_mask);
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/ego_sample_cpu.h"
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
PyMODINIT_FUNC PyInit__ego_sample_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__ego_sample_cpu(void) { return NULL; }
#endif
#endif
#endif
// Returns `rowptr`, `col`, `n_id`, `e_id`, `ptr`, `root_n_id`
SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
torch::Tensor, torch::Tensor>
ego_k_hop_sample_adj(torch::Tensor rowptr, torch::Tensor col, torch::Tensor idx,
int64_t depth, int64_t num_neighbors, bool replace) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return ego_k_hop_sample_adj_cpu(rowptr, col, idx, depth, num_neighbors,
replace);
}
}
static auto registry = torch::RegisterOperators().op(
"torch_sparse::ego_k_hop_sample_adj", &ego_k_hop_sample_adj);
#include <torch/torch.h>
#include "sparse.h"
// for getpid()
#ifdef _WIN32
#include <process.h>
#else
#include <unistd.h>
#endif
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/hgt_sample_cpu.h"
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
PyMODINIT_FUNC PyInit__hgt_sample_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__hgt_sample_cpu(void) { return NULL; }
#endif
#endif
#endif
// Returns 'output_node_dict', 'row_dict', 'col_dict', 'output_edge_dict'
SPARSE_API std::tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hgt_sample(const c10::Dict<std::string, torch::Tensor> &colptr_dict,
const c10::Dict<std::string, torch::Tensor> &row_dict,
const c10::Dict<std::string, torch::Tensor> &input_node_dict,
const c10::Dict<std::string, std::vector<int64_t>> &num_samples_dict,
const int64_t num_hops) {
return hgt_sample_cpu(colptr_dict, row_dict, input_node_dict,
num_samples_dict, num_hops);
}
static auto registry =
torch::RegisterOperators().op("torch_sparse::hgt_sample", &hgt_sample);
#pragma once
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
static inline __device__ void atomAdd(double *address, double val) {
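  // Older GPUs (compute capability < 6.0) and older toolkits provide no native
  // double-precision atomicAdd, so emulate it with an atomicCAS loop on the
  // 64-bit integer representation of the value.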
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
#pragma once
#include "../extensions.h"
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E);
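// ind2ptr_cuda compresses a sorted COO row-index vector of length E into a CSR
// row pointer of length M + 1; ptr2ind_cuda expands it back. For example,
// ind = [0, 0, 1, 3] with M = 4 gives ptr = [0, 2, 3, 3, 4], and
// ptr2ind([0, 2, 3, 3, 4], E = 4) recovers [0, 0, 1, 3].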
#include "hip/hip_runtime.h"
#include "convert_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
__global__ void ind2ptr_kernel(const int64_t *ind_data, int64_t *out_data,
int64_t M, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx == 0) {
for (int64_t i = 0; i <= ind_data[0]; i++)
out_data[i] = 0;
} else if (thread_idx < numel) {
for (int64_t i = ind_data[thread_idx - 1]; i < ind_data[thread_idx]; i++)
out_data[i + 1] = thread_idx;
} else if (thread_idx == numel) {
for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
out_data[i] = numel;
}
}
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M) {
CHECK_CUDA(ind);
hipSetDevice(ind.get_device());
auto out = torch::empty(M + 1, ind.options());
if (ind.numel() == 0)
return out.zero_();
auto ind_data = ind.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
ind2ptr_kernel<<<(ind.numel() + 2 + THREADS - 1) / THREADS, THREADS, 0,
stream>>>(ind_data, out_data, M, ind.numel());
return out;
}
__global__ void ptr2ind_kernel(const int64_t *ptr_data, int64_t *out_data,
int64_t E, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx < numel) {
int64_t idx = ptr_data[thread_idx], next_idx = ptr_data[thread_idx + 1];
for (int64_t i = idx; i < next_idx; i++) {
out_data[i] = thread_idx;
}
}
}
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E) {
CHECK_CUDA(ptr);
hipSetDevice(ptr.get_device());
auto out = torch::empty(E, ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
ptr2ind_kernel<<<(ptr.numel() - 1 + THREADS - 1) / THREADS, THREADS, 0,
stream>>>(ptr_data, out_data, E, ptr.numel() - 1);
return out;
}
#include "hip/hip_runtime.h"
#include "convert_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
__global__ void ind2ptr_kernel(const int64_t *ind_data, int64_t *out_data,
int64_t M, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx == 0) {
for (int64_t i = 0; i <= ind_data[0]; i++)
out_data[i] = 0;
} else if (thread_idx < numel) {
for (int64_t i = ind_data[thread_idx - 1]; i < ind_data[thread_idx]; i++)
out_data[i + 1] = thread_idx;
} else if (thread_idx == numel) {
for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
out_data[i] = numel;
}
}
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M) {
CHECK_CUDA(ind);
hipSetDevice(ind.get_device());
auto out = torch::empty(M + 1, ind.options());
if (ind.numel() == 0)
return out.zero_();
auto ind_data = ind.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( ind2ptr_kernel), dim3((ind.numel() + 2 + THREADS - 1) / THREADS), dim3(THREADS), 0,
stream, ind_data, out_data, M, ind.numel());
return out;
}
__global__ void ptr2ind_kernel(const int64_t *ptr_data, int64_t *out_data,
int64_t E, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx < numel) {
int64_t idx = ptr_data[thread_idx], next_idx = ptr_data[thread_idx + 1];
for (int64_t i = idx; i < next_idx; i++) {
out_data[i] = thread_idx;
}
}
}
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E) {
CHECK_CUDA(ptr);
hipSetDevice(ptr.get_device());
auto out = torch::empty(E, ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( ptr2ind_kernel), dim3((ptr.numel() - 1 + THREADS - 1) / THREADS), dim3(THREADS), 0,
stream, ptr_data, out_data, E, ptr.numel() - 1);
return out;
}
#pragma once
#include "../extensions.h"
torch::Tensor non_diag_mask_cuda(torch::Tensor row, torch::Tensor col,
int64_t M, int64_t N, int64_t k);