Commit c5f5be51 authored by rusty1s

implementing convert

parent 3c6dbfa1
Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
#include <torch/extension.h>
#include "compat.h"
at::Tensor degree(at::Tensor row, int64_t num_nodes) {
  auto zero = at::zeros(num_nodes, row.options());
  auto one = at::ones(row.size(0), row.options());
  return zero.scatter_add_(0, row, one);
}

std::tuple<at::Tensor, at::Tensor> to_csr(at::Tensor row, at::Tensor col,
                                          int64_t num_nodes) {
  // Assert already coalesced input.
  row = degree(row, num_nodes).cumsum(0);
  row = at::cat({at::zeros(1, row.options()), row}, 0); // Prepend zero.
  return std::make_tuple(row, col);
}
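// Worked example (illustration only, not part of the original file): for a
// coalesced edge list with row = [0, 0, 1, 2] and num_nodes = 4,
//   degree(row, 4)  -> [2, 1, 1, 0]
//   .cumsum(0)      -> [2, 3, 4, 4]
//   prepend zero    -> [0, 2, 3, 4, 4]   (the CSR row pointer)
// so the column indices of row i are col[rowptr[i] .. rowptr[i + 1] - 1].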
at::Tensor spspmm_bw(at::Tensor index, at::Tensor indexA, at::Tensor valueA,
                     at::Tensor indexB, at::Tensor valueB, size_t rowA_max,
                     size_t rowB_max) {
  int64_t *index_data = index.DATA_PTR<int64_t>();
  auto value = at::zeros(index.size(1), valueA.options());

  at::Tensor rowA, colA;
  std::tie(rowA, colA) = to_csr(indexA[0], indexA[1], rowA_max);
  int64_t *rowA_data = rowA.DATA_PTR<int64_t>();
  int64_t *colA_data = colA.DATA_PTR<int64_t>();

  at::Tensor rowB, colB;
  std::tie(rowB, colB) = to_csr(indexB[0], indexB[1], rowB_max);
  int64_t *rowB_data = rowB.DATA_PTR<int64_t>();
  int64_t *colB_data = colB.DATA_PTR<int64_t>();

  AT_DISPATCH_FLOATING_TYPES(valueA.scalar_type(), "spspmm_bw", [&] {
    scalar_t *value_data = value.DATA_PTR<scalar_t>();
    scalar_t *valueA_data = valueA.DATA_PTR<scalar_t>();
    scalar_t *valueB_data = valueB.DATA_PTR<scalar_t>();

    for (int64_t e = 0; e < value.size(0); e++) {
      int64_t i = index_data[e], j = index_data[value.size(0) + e];
      // Intersect the sorted column lists of row i of A and row j of B.
      for (ptrdiff_t dA = rowA_data[i]; dA < rowA_data[i + 1]; dA++) {
        int64_t cA = colA_data[dA];
        for (ptrdiff_t dB = rowB_data[j]; dB < rowB_data[j + 1]; dB++) {
          int64_t cB = colB_data[dB];
          if (cA == cB) {
            value_data[e] += valueA_data[dA] * valueB_data[dB];
          }
          if (cB >= cA) {
            break; // Columns are sorted and coalesced, no further match.
          }
        }
      }
    }
  });

  return value;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("spspmm_bw", &spspmm_bw,
        "Sparse-Sparse Matrix Multiplication Backward (CPU)");
}
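// Illustrative dense reference (a sketch for clarity, not part of this
// commit): for each queried pair (i, j) in `index`, spspmm_bw above
// accumulates sum_c A[i][c] * B[j][c], i.e. it evaluates (A * B^T) only at
// the listed entries. The early `break` in the sparse loops relies on the
// column indices being sorted within each row (coalesced input). The function
// name and std::vector layout below are assumptions made for the example.
#include <cstdint>
#include <utility>
#include <vector>

std::vector<float> spspmm_bw_dense_reference(
    const std::vector<std::pair<int64_t, int64_t>> &index,
    const std::vector<std::vector<float>> &A,
    const std::vector<std::vector<float>> &B) {
  std::vector<float> value(index.size(), 0.0f);
  for (size_t e = 0; e < index.size(); e++) {
    int64_t i = index[e].first, j = index[e].second;
    for (size_t c = 0; c < A[i].size(); c++)
      value[e] += A[i][c] * B[j][c]; // Dot product of row i of A, row j of B.
  }
  return value;
}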
#include <torch/script.h>
#include "cpu/convert_cpu.h"
#ifdef WITH_CUDA
#include "cuda/convert_cuda.h"
#endif
torch::Tensor ind2ptr(torch::Tensor ind, int64_t M) {
  if (ind.device().is_cuda()) {
#ifdef WITH_CUDA
    return ind2ptr_cuda(ind, M);
#else
    AT_ERROR("Not compiled with CUDA support");
#endif
  } else {
    return ind2ptr_cpu(ind, M);
  }
}

torch::Tensor ptr2ind(torch::Tensor ptr, int64_t E) {
  if (ptr.device().is_cuda()) {
#ifdef WITH_CUDA
    return ptr2ind_cuda(ptr, E);
#else
    AT_ERROR("Not compiled with CUDA support");
#endif
  } else {
    return ptr2ind_cpu(ptr, E);
  }
}

static auto registry = torch::RegisterOperators()
                           .op("torch_sparse::ind2ptr", &ind2ptr)
                           .op("torch_sparse::ptr2ind", &ptr2ind);
#include "convert_cpu.h"

#include "utils.h"

torch::Tensor ind2ptr_cpu(torch::Tensor ind, int64_t M) {
  CHECK_CPU(ind);
  auto out = torch::empty(M + 1, ind.options());
  auto ind_data = ind.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  int64_t numel = ind.numel(), idx = ind_data[0], next_idx;
  for (int64_t i = 0; i <= idx; i++)
    out_data[i] = 0;

  for (int64_t i = 0; i < numel - 1; i++) {
    next_idx = ind_data[i + 1];
    for (int64_t j = idx; j < next_idx; j++)
      out_data[j + 1] = i + 1;
    idx = next_idx;
  }

  for (int64_t i = idx + 1; i < M + 1; i++)
    out_data[i] = numel;

  return out;
}

torch::Tensor ptr2ind_cpu(torch::Tensor ptr, int64_t E) {
  CHECK_CPU(ptr);
  auto out = torch::empty(E, ptr.options());
  auto ptr_data = ptr.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  int64_t idx = ptr_data[0], next_idx;
  for (int64_t i = 0; i < ptr.numel() - 1; i++) {
    next_idx = ptr_data[i + 1];
    for (int64_t e = idx; e < next_idx; e++)
      out_data[e] = i;
    idx = next_idx;
  }

  return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor ind2ptr_cpu(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cpu(torch::Tensor ptr, int64_t E);
#pragma once
#include <torch/extension.h>
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#include "convert_cuda.h"
#include <ATen/cuda/CUDAContext.h>
#include "utils.cuh"
#define THREADS 256
__global__ void ind2ptr_kernel(const int64_t *ind_data, int64_t *out_data,
                               int64_t M, int64_t numel) {
  int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;

  if (thread_idx == 0) {
    // Thread 0 fills the leading pointers for empty rows before ind[0].
    for (int64_t i = 0; i <= ind_data[0]; i++)
      out_data[i] = 0;
  } else if (thread_idx < numel) {
    // One thread per element writes the pointers for the rows between
    // consecutive row indices ind[t - 1] and ind[t].
    for (int64_t i = ind_data[thread_idx - 1]; i < ind_data[thread_idx]; i++)
      out_data[i + 1] = thread_idx;
  } else if (thread_idx == numel) {
    // One extra thread fills the trailing pointers for empty rows at the end.
    for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
      out_data[i] = numel;
  }
}
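// Worked example (illustration only): for ind = [1, 1, 3], M = 4, numel = 3,
// thread 0 writes out[0..1] = 0, thread 1 writes nothing (ind[0] == ind[1]),
// thread 2 writes out[2..3] = 2, and thread 3 (== numel) writes out[4] = 3,
// giving the row pointer [0, 0, 2, 2, 3].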
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M) {
  CHECK_CUDA(ind);
  cudaSetDevice(ind.get_device());
  auto out = torch::empty(M + 1, ind.options());
  auto ind_data = ind.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();
  auto stream = at::cuda::getCurrentCUDAStream();
  // Launch at least numel + 1 threads: thread 0 and thread numel handle the
  // boundary fills, threads 1..numel-1 handle the interior.
  ind2ptr_kernel<<<(ind.numel() + 2 + THREADS - 1) / THREADS, THREADS, 0,
                   stream>>>(ind_data, out_data, M, ind.numel());
  return out;
}
__global__ void ptr2ind_kernel(const int64_t *ptr_data, int64_t *out_data,
                               int64_t E, int64_t numel) {
  int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;

  // `ptr` has numel entries and therefore numel - 1 intervals; guarding with
  // numel - 1 keeps the read of ptr_data[thread_idx + 1] in bounds.
  if (thread_idx < numel - 1) {
    // Each thread expands one rowptr interval into its repeated row index.
    int64_t idx = ptr_data[thread_idx], next_idx = ptr_data[thread_idx + 1];
    for (int64_t i = idx; i < next_idx; i++) {
      out_data[i] = thread_idx;
    }
  }
}
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E) {
  CHECK_CUDA(ptr);
  cudaSetDevice(ptr.get_device());
  auto out = torch::empty(E, ptr.options());
  auto ptr_data = ptr.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();
  auto stream = at::cuda::getCurrentCUDAStream();
  ptr2ind_kernel<<<(ptr.numel() + THREADS - 1) / THREADS, THREADS, 0, stream>>>(
      ptr_data, out_data, E, ptr.numel());
  return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E);