Commit c5f5be51 authored by rusty1s

implementing convert

parent 3c6dbfa1
Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
#include <torch/extension.h>
#include "compat.h"
at::Tensor degree(at::Tensor row, int64_t num_nodes) {
  auto zero = at::zeros(num_nodes, row.options());
  auto one = at::ones(row.size(0), row.options());
  return zero.scatter_add_(0, row, one);
}

std::tuple<at::Tensor, at::Tensor> to_csr(at::Tensor row, at::Tensor col,
                                          int64_t num_nodes) {
  // Assert already coalesced input.
  row = degree(row, num_nodes).cumsum(0);
  row = at::cat({at::zeros(1, row.options()), row}, 0); // Prepend zero.
  return std::make_tuple(row, col);
}
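// Worked example (illustration only, not part of the original file): for a
// coalesced edge list with row = [0, 0, 1, 2] and num_nodes = 4,
//   degree(row, 4)  -> [2, 1, 1, 0]
//   .cumsum(0)      -> [2, 3, 4, 4]
//   prepend zero    -> [0, 2, 3, 4, 4]   (the CSR row pointer)
// so the column indices of row i are col[rowptr[i] .. rowptr[i + 1] - 1].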
at::Tensor spspmm_bw(at::Tensor index, at::Tensor indexA, at::Tensor valueA,
                     at::Tensor indexB, at::Tensor valueB, size_t rowA_max,
                     size_t rowB_max) {
  int64_t *index_data = index.DATA_PTR<int64_t>();
  auto value = at::zeros(index.size(1), valueA.options());

  at::Tensor rowA, colA;
  std::tie(rowA, colA) = to_csr(indexA[0], indexA[1], rowA_max);
  int64_t *rowA_data = rowA.DATA_PTR<int64_t>();
  int64_t *colA_data = colA.DATA_PTR<int64_t>();

  at::Tensor rowB, colB;
  std::tie(rowB, colB) = to_csr(indexB[0], indexB[1], rowB_max);
  int64_t *rowB_data = rowB.DATA_PTR<int64_t>();
  int64_t *colB_data = colB.DATA_PTR<int64_t>();

  AT_DISPATCH_FLOATING_TYPES(valueA.scalar_type(), "spspmm_bw", [&] {
    scalar_t *value_data = value.DATA_PTR<scalar_t>();
    scalar_t *valueA_data = valueA.DATA_PTR<scalar_t>();
    scalar_t *valueB_data = valueB.DATA_PTR<scalar_t>();

    for (int64_t e = 0; e < value.size(0); e++) {
      int64_t i = index_data[e], j = index_data[value.size(0) + e];
      // Intersect the sorted column lists of row i of A and row j of B.
      for (ptrdiff_t dA = rowA_data[i]; dA < rowA_data[i + 1]; dA++) {
        int64_t cA = colA_data[dA];
        for (ptrdiff_t dB = rowB_data[j]; dB < rowB_data[j + 1]; dB++) {
          int64_t cB = colB_data[dB];
          if (cA == cB) {
            value_data[e] += valueA_data[dA] * valueB_data[dB];
          }
          if (cB >= cA) {
            break; // Columns are sorted and coalesced, no further match.
          }
        }
      }
    }
  });

  return value;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("spspmm_bw", &spspmm_bw,
        "Sparse-Sparse Matrix Multiplication Backward (CPU)");
}
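// Illustrative dense reference (a sketch for clarity, not part of this
// commit): for each queried pair (i, j) in `index`, spspmm_bw above
// accumulates sum_c A[i][c] * B[j][c], i.e. it evaluates (A * B^T) only at
// the listed entries. The early `break` in the sparse loops relies on the
// column indices being sorted within each row (coalesced input). The function
// name and std::vector layout below are assumptions made for the example.
#include <cstdint>
#include <utility>
#include <vector>

std::vector<float> spspmm_bw_dense_reference(
    const std::vector<std::pair<int64_t, int64_t>> &index,
    const std::vector<std::vector<float>> &A,
    const std::vector<std::vector<float>> &B) {
  std::vector<float> value(index.size(), 0.0f);
  for (size_t e = 0; e < index.size(); e++) {
    int64_t i = index[e].first, j = index[e].second;
    for (size_t c = 0; c < A[i].size(); c++)
      value[e] += A[i][c] * B[j][c]; // Dot product of row i of A, row j of B.
  }
  return value;
}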
#include <torch/script.h>
#include "cpu/convert_cpu.h"
#ifdef WITH_CUDA
#include "cuda/convert_cuda.h"
#endif
torch::Tensor ind2ptr(torch::Tensor ind, int64_t M) {
  if (ind.device().is_cuda()) {
#ifdef WITH_CUDA
    return ind2ptr_cuda(ind, M);
#else
    AT_ERROR("Not compiled with CUDA support");
#endif
  } else {
    return ind2ptr_cpu(ind, M);
  }
}

torch::Tensor ptr2ind(torch::Tensor ptr, int64_t E) {
  if (ptr.device().is_cuda()) {
#ifdef WITH_CUDA
    return ptr2ind_cuda(ptr, E);
#else
    AT_ERROR("Not compiled with CUDA support");
#endif
  } else {
    return ptr2ind_cpu(ptr, E);
  }
}

static auto registry = torch::RegisterOperators()
                           .op("torch_sparse::ind2ptr", &ind2ptr)
                           .op("torch_sparse::ptr2ind", &ptr2ind);
#include "convert_cpu.h"

#include "utils.h"

torch::Tensor ind2ptr_cpu(torch::Tensor ind, int64_t M) {
  CHECK_CPU(ind);
  auto out = torch::empty(M + 1, ind.options());
  auto ind_data = ind.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  int64_t numel = ind.numel(), idx = ind_data[0], next_idx;
  for (int64_t i = 0; i <= idx; i++)
    out_data[i] = 0;

  for (int64_t i = 0; i < numel - 1; i++) {
    next_idx = ind_data[i + 1];
    for (int64_t j = idx; j < next_idx; j++)
      out_data[j + 1] = i + 1;
    idx = next_idx;
  }

  for (int64_t i = idx + 1; i < M + 1; i++)
    out_data[i] = numel;

  return out;
}

torch::Tensor ptr2ind_cpu(torch::Tensor ptr, int64_t E) {
  CHECK_CPU(ptr);
  auto out = torch::empty(E, ptr.options());
  auto ptr_data = ptr.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  int64_t idx = ptr_data[0], next_idx;
  for (int64_t i = 0; i < ptr.numel() - 1; i++) {
    next_idx = ptr_data[i + 1];
    for (int64_t e = idx; e < next_idx; e++)
      out_data[e] = i;
    idx = next_idx;
  }

  return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor ind2ptr_cpu(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cpu(torch::Tensor ptr, int64_t E);
#pragma once
#include <torch/extension.h>
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#include "convert_cuda.h"
#include <ATen/cuda/CUDAContext.h>
#include "utils.cuh"
#define THREADS 256
__global__ void ind2ptr_kernel(const int64_t *ind_data, int64_t *out_data,
                               int64_t M, int64_t numel) {
  int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;

  if (thread_idx == 0) {
    // Thread 0 fills the leading pointers for empty rows before ind[0].
    for (int64_t i = 0; i <= ind_data[0]; i++)
      out_data[i] = 0;
  } else if (thread_idx < numel) {
    // One thread per element writes the pointers for the rows between
    // consecutive row indices ind[t - 1] and ind[t].
    for (int64_t i = ind_data[thread_idx - 1]; i < ind_data[thread_idx]; i++)
      out_data[i + 1] = thread_idx;
  } else if (thread_idx == numel) {
    // One extra thread fills the trailing pointers for empty rows at the end.
    for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
      out_data[i] = numel;
  }
}
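// Worked example (illustration only): for ind = [1, 1, 3], M = 4, numel = 3,
// thread 0 writes out[0..1] = 0, thread 1 writes nothing (ind[0] == ind[1]),
// thread 2 writes out[2..3] = 2, and thread 3 (== numel) writes out[4] = 3,
// giving the row pointer [0, 0, 2, 2, 3].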
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M) {
  CHECK_CUDA(ind);
  cudaSetDevice(ind.get_device());
  auto out = torch::empty(M + 1, ind.options());
  auto ind_data = ind.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();
  auto stream = at::cuda::getCurrentCUDAStream();
  // Launch at least numel + 1 threads: thread 0 and thread numel handle the
  // boundary fills, threads 1..numel-1 handle the interior.
  ind2ptr_kernel<<<(ind.numel() + 2 + THREADS - 1) / THREADS, THREADS, 0,
                   stream>>>(ind_data, out_data, M, ind.numel());
  return out;
}
__global__ void ptr2ind_kernel(const int64_t *ptr_data, int64_t *out_data,
                               int64_t E, int64_t numel) {
  int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;

  // `ptr` has numel entries and therefore numel - 1 intervals; guarding with
  // numel - 1 keeps the read of ptr_data[thread_idx + 1] in bounds.
  if (thread_idx < numel - 1) {
    // Each thread expands one rowptr interval into its repeated row index.
    int64_t idx = ptr_data[thread_idx], next_idx = ptr_data[thread_idx + 1];
    for (int64_t i = idx; i < next_idx; i++) {
      out_data[i] = thread_idx;
    }
  }
}
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E) {
  CHECK_CUDA(ptr);
  cudaSetDevice(ptr.get_device());
  auto out = torch::empty(E, ptr.options());
  auto ptr_data = ptr.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();
  auto stream = at::cuda::getCurrentCUDAStream();
  ptr2ind_kernel<<<(ptr.numel() + THREADS - 1) / THREADS, THREADS, 0, stream>>>(
      ptr_data, out_data, E, ptr.numel());
  return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E);