Commit 4d4e064b authored by yangzhong

push 1.6.0 version

parent 6907f8b7
#include "hip/hip_runtime.h"
#include "rw_hip.h"
#include "rw_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <hiprand.h>
#include <hiprand_kernel.h>
#include <ATen/cuda/CUDAContext.h>
#include <curand.h>
#include <curand_kernel.h>
#include "utils.cuh"
@@ -46,8 +45,8 @@ rejection_sampling_kernel(unsigned int seed, const int64_t *rowptr,
const int64_t walk_length, const int64_t numel,
const double p, const double q) {
hiprandState_t state;
hiprand_init(seed, 0, 0, &state);
curandState_t state;
curand_init(seed, 0, 0, &state);
double max_prob = fmax(fmax(1. / p, 1.), 1. / q);
double prob_0 = 1. / p / max_prob;
@@ -66,7 +65,7 @@ rejection_sampling_kernel(unsigned int seed, const int64_t *rowptr,
e_cur = -1;
v = t;
} else {
e_cur = row_start + (hiprand(&state) % (row_end - row_start));
e_cur = row_start + (curand(&state) % (row_end - row_start));
v = col[e_cur];
}
@@ -84,10 +83,10 @@ rejection_sampling_kernel(unsigned int seed, const int64_t *rowptr,
x = col[e_cur];
} else {
while (true) {
e_cur = row_start + (hiprand(&state) % (row_end - row_start));
e_cur = row_start + (curand(&state) % (row_end - row_start));
x = col[e_cur];
double r = hiprand_uniform(&state); // (0, 1]
double r = curand_uniform(&state); // (0, 1]
if (x == t && r < prob_0)
break;
@@ -122,7 +121,7 @@ random_walk_cuda(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
CHECK_CUDA(rowptr);
CHECK_CUDA(col);
CHECK_CUDA(start);
hipSetDevice(rowptr.get_device());
cudaSetDevice(rowptr.get_device());
CHECK_INPUT(rowptr.dim() == 1);
CHECK_INPUT(col.dim() == 1);
......
@@ -3,12 +3,12 @@
#include "cpu/fps_cpu.h"
#ifdef WITH_HIP
#include "hip/fps_hip.h"
#ifdef WITH_CUDA
#include "cuda/fps_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__fps_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__fps_cpu(void) { return NULL; }
@@ -18,7 +18,7 @@ PyMODINIT_FUNC PyInit__fps_cpu(void) { return NULL; }
torch::Tensor fps(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
bool random_start) {
if (src.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return fps_cuda(src, ptr, ratio, random_start);
#else
AT_ERROR("Not compiled with CUDA support");
......
@@ -3,12 +3,12 @@
#include "cpu/graclus_cpu.h"
#ifdef WITH_HIP
#include "hip/graclus_hip.h"
#ifdef WITH_CUDA
#include "cuda/graclus_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__graclus_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__graclus_cpu(void) { return NULL; }
@@ -18,7 +18,7 @@ PyMODINIT_FUNC PyInit__graclus_cpu(void) { return NULL; }
torch::Tensor graclus(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return graclus_cuda(rowptr, col, optional_weight);
#else
AT_ERROR("Not compiled with CUDA support");
......
@@ -3,12 +3,12 @@
#include "cpu/grid_cpu.h"
#ifdef WITH_HIP
#include "hip/grid_hip.h"
#ifdef WITH_CUDA
#include "cuda/grid_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__grid_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__grid_cpu(void) { return NULL; }
@@ -19,7 +19,7 @@ torch::Tensor grid(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end) {
if (pos.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return grid_cuda(pos, size, optional_start, optional_end);
#else
AT_ERROR("Not compiled with CUDA support");
......
#include "hip/hip_runtime.h"
#include "fps_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
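// Farthest point sampling: one block per example. Each iteration measures the
// distance from every point to the most recently selected point, keeps the
// running minimum in `dist`, and selects the point with the largest remaining
// distance via a shared-memory reduction over THREADS partial results.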
template <typename scalar_t>
__global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
const int64_t *out_ptr, const int64_t *start,
scalar_t *dist, int64_t *out, int64_t dim) {
const int64_t thread_idx = threadIdx.x;
const int64_t batch_idx = blockIdx.x;
const int64_t start_idx = ptr[batch_idx];
const int64_t end_idx = ptr[batch_idx + 1];
__shared__ scalar_t best_dist[THREADS];
__shared__ int64_t best_dist_idx[THREADS];
if (thread_idx == 0) {
out[out_ptr[batch_idx]] = start_idx + start[batch_idx];
}
for (int64_t m = out_ptr[batch_idx] + 1; m < out_ptr[batch_idx + 1]; m++) {
__syncthreads();
int64_t old = out[m - 1];
scalar_t best = (scalar_t)-1.;
int64_t best_idx = 0;
for (int64_t n = start_idx + thread_idx; n < end_idx; n += THREADS) {
scalar_t tmp, dd = (scalar_t)0.;
for (int64_t d = 0; d < dim; d++) {
tmp = src[dim * old + d] - src[dim * n + d];
dd += tmp * tmp;
}
dd = min(dist[n], dd);
dist[n] = dd;
if (dd > best) {
best = dd;
best_idx = n;
}
}
best_dist[thread_idx] = best;
best_dist_idx[thread_idx] = best_idx;
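// Shared-memory max-reduction over the per-thread best candidates.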
for (int64_t i = 1; i < THREADS; i *= 2) {
__syncthreads();
if ((thread_idx + i) < THREADS &&
best_dist[thread_idx] < best_dist[thread_idx + i]) {
best_dist[thread_idx] = best_dist[thread_idx + i];
best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
}
}
__syncthreads();
if (thread_idx == 0) {
out[m] = best_dist_idx[0];
}
}
}
torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr,
torch::Tensor ratio, bool random_start) {
CHECK_CUDA(src);
CHECK_CUDA(ptr);
CHECK_CUDA(ratio);
CHECK_INPUT(ptr.dim() == 1);
hipSetDevice(src.get_device());
src = src.view({src.size(0), -1}).contiguous();
ptr = ptr.contiguous();
auto batch_size = ptr.numel() - 1;
auto deg = ptr.narrow(0, 1, batch_size) - ptr.narrow(0, 0, batch_size);
auto out_ptr = deg.toType(ratio.scalar_type()) * ratio;
out_ptr = out_ptr.ceil().toType(torch::kLong).cumsum(0);
out_ptr = torch::cat({torch::zeros(1, ptr.options()), out_ptr}, 0);
torch::Tensor start;
if (random_start) {
start = torch::rand(batch_size, src.options());
start = (start * deg.toType(ratio.scalar_type())).toType(torch::kLong);
} else {
start = torch::zeros(batch_size, ptr.options());
}
auto dist = torch::full(src.size(0), 5e4, src.options());
auto out_size = (int64_t *)malloc(sizeof(int64_t));
hipMemcpy(out_size, out_ptr[-1].data_ptr<int64_t>(), sizeof(int64_t),
hipMemcpyDeviceToHost);
auto out = torch::empty(out_size[0], out_ptr.options());
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
auto scalar_type = src.scalar_type();
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
hipLaunchKernelGGL(( fps_kernel<scalar_t>), dim3(batch_size), dim3(THREADS), 0, stream,
src.data_ptr<scalar_t>(), ptr.data_ptr<int64_t>(),
out_ptr.data_ptr<int64_t>(), start.data_ptr<int64_t>(),
dist.data_ptr<scalar_t>(), out.data_ptr<int64_t>(), src.size(1));
});
return out;
}
#include "hip/hip_runtime.h"
#include "graclus_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
#define BLUE_P 0.53406
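// Node states in `out`: >= 0 matched (cluster id), -1 blue (proposes),
// -2 red (responds). colorize() assigns blue with probability BLUE_P.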
__device__ bool done_d;
__global__ void init_done_kernel() { done_d = true; }
__global__ void colorize_kernel(int64_t *out, const float *bernoulli,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
if (out[thread_idx] < 0) {
out[thread_idx] = (int64_t)bernoulli[thread_idx] - 2;
done_d = false;
}
}
}
bool colorize(torch::Tensor out) {
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( init_done_kernel), dim3(1), dim3(1), 0, stream, );
auto numel = out.size(0);
auto props = torch::full(numel, BLUE_P, out.options().dtype(torch::kFloat));
auto bernoulli = props.bernoulli();
hipLaunchKernelGGL(( colorize_kernel), dim3(BLOCKS(numel)), dim3(THREADS), 0, stream,
out.data_ptr<int64_t>(), bernoulli.data_ptr<float>(), numel);
bool done_h;
hipMemcpyFromSymbol(&done_h, HIP_SYMBOL(done_d), sizeof(done_h), 0,
hipMemcpyDeviceToHost);
return done_h;
}
__global__ void propose_kernel(int64_t *out, int64_t *proposal,
const int64_t *rowptr, const int64_t *col,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
if (out[thread_idx] != -1)
return; // Only visit blue nodes.
bool has_unmatched_neighbor = false;
for (int64_t i = rowptr[thread_idx]; i < rowptr[thread_idx + 1]; i++) {
auto v = col[i];
if (out[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (out[v] == -2) {
proposal[thread_idx] = v; // Propose to first red neighbor.
break;
}
}
if (!has_unmatched_neighbor)
out[thread_idx] = thread_idx;
}
}
template <typename scalar_t>
__global__ void weighted_propose_kernel(int64_t *out, int64_t *proposal,
const int64_t *rowptr,
const int64_t *col,
const scalar_t *weight, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
if (out[thread_idx] != -1)
return; // Only visit blue nodes.
bool has_unmatched_neighbor = false;
int64_t v_max = -1;
scalar_t w_max = 0;
for (int64_t i = rowptr[thread_idx]; i < rowptr[thread_idx + 1]; i++) {
auto v = col[i];
if (out[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
// Find maximum weighted red neighbor.
if (out[v] == -2 && weight[i] >= w_max) {
v_max = v;
w_max = weight[i];
}
}
proposal[thread_idx] = v_max; // Propose.
if (!has_unmatched_neighbor)
out[thread_idx] = thread_idx;
}
}
void propose(torch::Tensor out, torch::Tensor proposal, torch::Tensor rowptr,
torch::Tensor col,
torch::optional<torch::Tensor> optional_weight) {
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (!optional_weight.has_value()) {
hipLaunchKernelGGL(( propose_kernel), dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(), out.numel());
} else {
auto weight = optional_weight.value();
auto scalar_type = weight.scalar_type();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
hipLaunchKernelGGL(( weighted_propose_kernel<scalar_t>)
, dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
weight.data_ptr<scalar_t>(), out.numel());
});
}
}
__global__ void respond_kernel(int64_t *out, const int64_t *proposal,
const int64_t *rowptr, const int64_t *col,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
if (out[thread_idx] != -2)
return; // Only visit red nodes.
bool has_unmatched_neighbor = false;
for (int64_t i = rowptr[thread_idx]; i < rowptr[thread_idx + 1]; i++) {
auto v = col[i];
if (out[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (out[v] == -1 && proposal[v] == thread_idx) {
// Match first blue neighbor v which proposed to u.
out[thread_idx] = min(thread_idx, v);
out[v] = min(thread_idx, v);
break;
}
}
if (!has_unmatched_neighbor)
out[thread_idx] = thread_idx;
}
}
template <typename scalar_t>
__global__ void weighted_respond_kernel(int64_t *out, const int64_t *proposal,
const int64_t *rowptr,
const int64_t *col,
const scalar_t *weight, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
if (out[thread_idx] != -2)
return; // Only visit red nodes.
bool has_unmatched_neighbor = false;
int64_t v_max = -1;
scalar_t w_max = 0;
for (int64_t i = rowptr[thread_idx]; i < rowptr[thread_idx + 1]; i++) {
auto v = col[i];
if (out[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (out[v] == -1 && proposal[v] == thread_idx && weight[i] >= w_max) {
// Find maximum weighted blue neighbor v which proposed to u.
v_max = v;
w_max = weight[i];
}
}
if (v_max >= 0) {
out[thread_idx] = min(thread_idx, v_max); // Match neighbors.
out[v_max] = min(thread_idx, v_max);
}
if (!has_unmatched_neighbor)
out[thread_idx] = thread_idx;
}
}
void respond(torch::Tensor out, torch::Tensor proposal, torch::Tensor rowptr,
torch::Tensor col,
torch::optional<torch::Tensor> optional_weight) {
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (!optional_weight.has_value()) {
hipLaunchKernelGGL(( respond_kernel), dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(), out.numel());
} else {
auto weight = optional_weight.value();
auto scalar_type = weight.scalar_type();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
hipLaunchKernelGGL(( weighted_respond_kernel<scalar_t>)
, dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
out.data_ptr<int64_t>(), proposal.data_ptr<int64_t>(),
rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
weight.data_ptr<scalar_t>(), out.numel());
});
}
}
torch::Tensor graclus_cuda(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight) {
CHECK_CUDA(rowptr);
CHECK_CUDA(col);
CHECK_INPUT(rowptr.dim() == 1 && col.dim() == 1);
if (optional_weight.has_value()) {
CHECK_CUDA(optional_weight.value());
CHECK_INPUT(optional_weight.value().dim() == 1);
CHECK_INPUT(optional_weight.value().numel() == col.numel());
}
hipSetDevice(rowptr.get_device());
int64_t num_nodes = rowptr.numel() - 1;
auto out = torch::full(num_nodes, -1, rowptr.options());
auto proposal = torch::full(num_nodes, -1, rowptr.options());
while (!colorize(out)) {
propose(out, proposal, rowptr, col, optional_weight);
respond(out, proposal, rowptr, col, optional_weight);
}
return out;
}
#include "hip/hip_runtime.h"
#include "grid_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
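// Voxel grid clustering: map each point to the flattened index of the grid
// cell it falls into, with `size` giving the cell extent per dimension.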
template <typename scalar_t>
__global__ void grid_kernel(const scalar_t *pos, const scalar_t *size,
const scalar_t *start, const scalar_t *end,
int64_t *out, int64_t D, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
int64_t c = 0, k = 1;
for (int64_t d = 0; d < D; d++) {
scalar_t p = pos[thread_idx * D + d] - start[d];
c += (int64_t)(p / size[d]) * k;
k *= (int64_t)((end[d] - start[d]) / size[d]) + 1;
}
out[thread_idx] = c;
}
}
torch::Tensor grid_cuda(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end) {
CHECK_CUDA(pos);
CHECK_CUDA(size);
hipSetDevice(pos.get_device());
if (optional_start.has_value())
CHECK_CUDA(optional_start.value());
if (optional_end.has_value())
CHECK_CUDA(optional_end.value());
pos = pos.view({pos.size(0), -1}).contiguous();
size = size.contiguous();
CHECK_INPUT(size.numel() == pos.size(1));
if (!optional_start.has_value())
optional_start = std::get<0>(pos.min(0));
else {
optional_start = optional_start.value().contiguous();
CHECK_INPUT(optional_start.value().numel() == pos.size(1));
}
if (!optional_end.has_value())
optional_end = std::get<0>(pos.max(0));
else {
optional_end = optional_end.value().contiguous();
CHECK_INPUT(optional_end.value().numel() == pos.size(1));
}
auto start = optional_start.value();
auto end = optional_end.value();
auto out = torch::empty(pos.size(0), pos.options().dtype(torch::kLong));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, pos.scalar_type(), "_", [&] {
hipLaunchKernelGGL(( grid_kernel<scalar_t>), dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
pos.data_ptr<scalar_t>(), size.data_ptr<scalar_t>(),
start.data_ptr<scalar_t>(), end.data_ptr<scalar_t>(),
out.data_ptr<int64_t>(), pos.size(1), out.numel());
});
return out;
}
#include "hip/hip_runtime.h"
#include "radius_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
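// Helpers for cosine distance: dot product and L2 norm over a single row.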
template <typename scalar_t> struct Cosine {
static inline __device__ scalar_t dot(const scalar_t *a, const scalar_t *b,
int64_t n_a, int64_t n_b,
int64_t size) {
scalar_t result = 0;
for (int64_t i = 0; i < size; i++) {
result += a[n_a * size + i] * b[n_b * size + i];
}
return result;
}
static inline __device__ scalar_t norm(const scalar_t *a, int64_t n_a,
int64_t size) {
scalar_t result = 0;
for (int64_t i = 0; i < size; i++) {
result += a[n_a * size + i] * a[n_a * size + i];
}
return sqrt(result);
}
};
template <typename scalar_t>
__global__ void
knn_kernel(const scalar_t *__restrict__ x, const scalar_t *__restrict__ y,
const int64_t *__restrict__ ptr_x, const int64_t *__restrict__ ptr_y,
int64_t *__restrict__ row, int64_t *__restrict__ col,
const int64_t k, const int64_t n, const int64_t m, const int64_t dim,
const int64_t num_examples, const bool cosine) {
const int64_t n_y = blockIdx.x * blockDim.x + threadIdx.x;
if (n_y >= m)
return;
const int64_t example_idx = get_example_idx(n_y, ptr_y, num_examples);
scalar_t best_dist[100];
int64_t best_idx[100];
for (int e = 0; e < k; e++) {
best_dist[e] = 5e4;
best_idx[e] = -1;
}
for (int64_t n_x = ptr_x[example_idx]; n_x < ptr_x[example_idx + 1]; n_x++) {
scalar_t tmp_dist = 0;
if (cosine) {
tmp_dist = Cosine<scalar_t>::dot(x, y, n_x, n_y, dim) /
(Cosine<scalar_t>::norm(x, n_x, dim) *
Cosine<scalar_t>::norm(y, n_y, dim));
tmp_dist = 1. - tmp_dist;
} else {
for (int64_t d = 0; d < dim; d++) {
tmp_dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
}
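// Insert the candidate into the sorted top-k list (insertion sort; k is
// capped at 100 by the host wrapper).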
for (int64_t e1 = 0; e1 < k; e1++) {
if (best_dist[e1] > tmp_dist) {
for (int64_t e2 = k - 1; e2 > e1; e2--) {
best_dist[e2] = best_dist[e2 - 1];
best_idx[e2] = best_idx[e2 - 1];
}
best_dist[e1] = tmp_dist;
best_idx[e1] = n_x;
break;
}
}
}
for (int64_t e = 0; e < k; e++) {
row[n_y * k + e] = n_y;
col[n_y * k + e] = best_idx[e];
}
}
torch::Tensor knn_cuda(const torch::Tensor x, const torch::Tensor y,
torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, const int64_t k,
const bool cosine) {
CHECK_CUDA(x);
CHECK_CONTIGUOUS(x);
CHECK_INPUT(x.dim() == 2);
CHECK_CUDA(y);
CHECK_CONTIGUOUS(y);
CHECK_INPUT(y.dim() == 2);
CHECK_INPUT(x.size(1) == y.size(1));
AT_ASSERTM(k <= 100, "`k` needs to be smaller than or equal to 100");
if (ptr_x.has_value()) {
CHECK_CUDA(ptr_x.value());
CHECK_INPUT(ptr_x.value().dim() == 1);
} else
ptr_x = torch::arange(0, x.size(0) + 1, x.size(0),
x.options().dtype(torch::kLong));
if (ptr_y.has_value()) {
CHECK_CUDA(ptr_y.value());
CHECK_INPUT(ptr_y.value().dim() == 1);
} else
ptr_y = torch::arange(0, y.size(0) + 1, y.size(0),
y.options().dtype(torch::kLong));
CHECK_INPUT(ptr_x.value().numel() == ptr_y.value().numel());
hipSetDevice(x.get_device());
auto row = torch::empty(y.size(0) * k, ptr_y.value().options());
auto col = torch::full(y.size(0) * k, -1, ptr_y.value().options());
dim3 BLOCKS((y.size(0) + THREADS - 1) / THREADS);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
auto scalar_type = x.scalar_type();
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
hipLaunchKernelGGL(( knn_kernel<scalar_t>), dim3(BLOCKS), dim3(THREADS), 0, stream,
x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
ptr_x.value().data_ptr<int64_t>(), ptr_y.value().data_ptr<int64_t>(),
row.data_ptr<int64_t>(), col.data_ptr<int64_t>(), k, x.size(0),
y.size(0), x.size(1), ptr_x.value().numel() - 1, cosine);
});
auto mask = col != -1;
return torch::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
#include "hip/hip_runtime.h"
#include "nearest_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t>
__global__ void nearest_kernel(const scalar_t *x, const scalar_t *y,
const int64_t *ptr_x, const int64_t *ptr_y,
int64_t *out, int64_t batch_size, int64_t dim) {
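// One block per query point x; threads scan the candidate points y of the
// same example in strides of THREADS.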
const int64_t thread_idx = threadIdx.x;
const int64_t n_x = blockIdx.x;
int64_t batch_idx;
for (int64_t b = 0; b < batch_size; b++) {
if (n_x >= ptr_x[b] && n_x < ptr_x[b + 1]) {
batch_idx = b;
break;
}
}
const int64_t y_start_idx = ptr_y[batch_idx];
const int64_t y_end_idx = ptr_y[batch_idx + 1];
__shared__ scalar_t best_dist[THREADS];
__shared__ int64_t best_dist_idx[THREADS];
scalar_t best = 1e38;
int64_t best_idx = 0;
for (int64_t n_y = y_start_idx + thread_idx; n_y < y_end_idx;
n_y += THREADS) {
scalar_t dist = 0;
for (int64_t d = 0; d < dim; d++) {
dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
if (dist < best) {
best = dist;
best_idx = n_y;
}
}
best_dist[thread_idx] = best;
best_dist_idx[thread_idx] = best_idx;
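// Tree reduction in shared memory: keep the smallest distance and its index.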
for (int64_t u = 0; (1 << u) < THREADS; u++) {
__syncthreads();
if (thread_idx < (THREADS >> (u + 1))) {
int64_t idx_1 = (thread_idx * 2) << u;
int64_t idx_2 = (thread_idx * 2 + 1) << u;
if (best_dist[idx_1] > best_dist[idx_2]) {
best_dist[idx_1] = best_dist[idx_2];
best_dist_idx[idx_1] = best_dist_idx[idx_2];
}
}
}
__syncthreads();
if (thread_idx == 0) {
out[n_x] = best_dist_idx[0];
}
}
torch::Tensor nearest_cuda(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_x, torch::Tensor ptr_y) {
CHECK_CUDA(x);
CHECK_CUDA(y);
CHECK_CUDA(ptr_x);
CHECK_CUDA(ptr_y);
hipSetDevice(x.get_device());
x = x.view({x.size(0), -1}).contiguous();
y = y.view({y.size(0), -1}).contiguous();
auto out = torch::empty({x.size(0)}, ptr_x.options());
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
auto scalar_type = x.scalar_type();
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
hipLaunchKernelGGL(( nearest_kernel<scalar_t>), dim3(x.size(0)), dim3(THREADS), 0, stream,
x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
ptr_x.data_ptr<int64_t>(), ptr_y.data_ptr<int64_t>(),
out.data_ptr<int64_t>(), ptr_x.size(0) - 1, x.size(1));
});
return out;
}
#include "hip/hip_runtime.h"
#include "radius_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
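// Radius search: for every query point y, collect up to max_num_neighbors
// points x within squared distance r (the host wrapper passes r * r).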
template <typename scalar_t>
__global__ void
radius_kernel(const scalar_t *__restrict__ x, const scalar_t *__restrict__ y,
const int64_t *__restrict__ ptr_x,
const int64_t *__restrict__ ptr_y, int64_t *__restrict__ row,
int64_t *__restrict__ col, const scalar_t r, const int64_t n,
const int64_t m, const int64_t dim, const int64_t num_examples,
const int64_t max_num_neighbors) {
const int64_t n_y = blockIdx.x * blockDim.x + threadIdx.x;
if (n_y >= m)
return;
int64_t count = 0;
const int64_t example_idx = get_example_idx(n_y, ptr_y, num_examples);
for (int64_t n_x = ptr_x[example_idx]; n_x < ptr_x[example_idx + 1]; n_x++) {
scalar_t dist = 0;
for (int64_t d = 0; d < dim; d++) {
dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
if (dist < r) {
row[n_y * max_num_neighbors + count] = n_y;
col[n_y * max_num_neighbors + count] = n_x;
count++;
}
if (count >= max_num_neighbors)
break;
}
}
torch::Tensor radius_cuda(const torch::Tensor x, const torch::Tensor y,
torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, const double r,
const int64_t max_num_neighbors) {
CHECK_CUDA(x);
CHECK_CONTIGUOUS(x);
CHECK_INPUT(x.dim() == 2);
CHECK_CUDA(y);
CHECK_CONTIGUOUS(y);
CHECK_INPUT(y.dim() == 2);
CHECK_INPUT(x.size(1) == y.size(1));
hipSetDevice(x.get_device());
if (ptr_x.has_value()) {
CHECK_CUDA(ptr_x.value());
CHECK_INPUT(ptr_x.value().dim() == 1);
} else
ptr_x = torch::arange(0, x.size(0) + 1, x.size(0),
x.options().dtype(torch::kLong));
if (ptr_y.has_value()) {
CHECK_CUDA(ptr_y.value());
CHECK_INPUT(ptr_y.value().dim() == 1);
} else
ptr_y = torch::arange(0, y.size(0) + 1, y.size(0),
y.options().dtype(torch::kLong));
CHECK_INPUT(ptr_x.value().numel() == ptr_y.value().numel());
hipSetDevice(x.get_device());
auto row =
torch::full(y.size(0) * max_num_neighbors, -1, ptr_y.value().options());
auto col =
torch::full(y.size(0) * max_num_neighbors, -1, ptr_y.value().options());
dim3 BLOCKS((y.size(0) + THREADS - 1) / THREADS);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
auto scalar_type = x.scalar_type();
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
hipLaunchKernelGGL(( radius_kernel<scalar_t>), dim3(BLOCKS), dim3(THREADS), 0, stream,
x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
ptr_x.value().data_ptr<int64_t>(), ptr_y.value().data_ptr<int64_t>(),
row.data_ptr<int64_t>(), col.data_ptr<int64_t>(), r * r, x.size(0),
y.size(0), x.size(1), ptr_x.value().numel() - 1, max_num_neighbors);
});
auto mask = row != -1;
return torch::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
#include "hip/hip_runtime.h"
#include "rw_hip.h"
#include <ATen/hip/HIPContext.h>
#include <hiprand.h>
#include <hiprand_kernel.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
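// Uniform random walk: each thread advances one walk, choosing a neighbor
// uniformly using the precomputed random numbers in `rand`.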
__global__ void uniform_sampling_kernel(const int64_t *rowptr,
const int64_t *col,
const int64_t *start, const float *rand,
int64_t *n_out, int64_t *e_out,
const int64_t walk_length,
const int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
int64_t n_cur = start[thread_idx], e_cur, row_start, row_end, rnd;
n_out[thread_idx] = n_cur;
for (int64_t l = 0; l < walk_length; l++) {
row_start = rowptr[n_cur], row_end = rowptr[n_cur + 1];
if (row_end - row_start == 0) {
e_cur = -1;
} else {
rnd = int64_t(rand[l * numel + thread_idx] * (row_end - row_start));
e_cur = row_start + rnd;
n_cur = col[e_cur];
}
n_out[(l + 1) * numel + thread_idx] = n_cur;
e_out[l * numel + thread_idx] = e_cur;
}
}
}
__global__ void
rejection_sampling_kernel(unsigned int seed, const int64_t *rowptr,
const int64_t *col, const int64_t *start,
int64_t *n_out, int64_t *e_out,
const int64_t walk_length, const int64_t numel,
const double p, const double q) {
hiprandState_t state;
hiprand_init(seed, 0, 0, &state);
double max_prob = fmax(fmax(1. / p, 1.), 1. / q);
double prob_0 = 1. / p / max_prob;
double prob_1 = 1. / max_prob;
double prob_2 = 1. / q / max_prob;
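// node2vec-style rejection sampling: a candidate x is accepted with
// probability prob_0 if it returns to t, prob_1 if it is a neighbor of t,
// and prob_2 otherwise.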
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_idx < numel) {
int64_t t = start[thread_idx], v, x, e_cur, row_start, row_end;
n_out[thread_idx] = t;
row_start = rowptr[t], row_end = rowptr[t + 1];
if (row_end - row_start == 0) {
e_cur = -1;
v = t;
} else {
e_cur = row_start + (hiprand(&state) % (row_end - row_start));
v = col[e_cur];
}
n_out[numel + thread_idx] = v;
e_out[thread_idx] = e_cur;
for (int64_t l = 1; l < walk_length; l++) {
row_start = rowptr[v], row_end = rowptr[v + 1];
if (row_end - row_start == 0) {
e_cur = -1;
x = v;
} else if (row_end - row_start == 1) {
e_cur = row_start;
x = col[e_cur];
} else {
while (true) {
e_cur = row_start + (hiprand(&state) % (row_end - row_start));
x = col[e_cur];
double r = hiprand_uniform(&state); // (0, 1]
if (x == t && r < prob_0)
break;
bool is_neighbor = false;
row_start = rowptr[x], row_end = rowptr[x + 1];
for (int64_t i = row_start; i < row_end; i++) {
if (col[i] == t) {
is_neighbor = true;
break;
}
}
if (is_neighbor && r < prob_1)
break;
else if (r < prob_2)
break;
}
}
n_out[(l + 1) * numel + thread_idx] = x;
e_out[l * numel + thread_idx] = e_cur;
t = v;
v = x;
}
}
}
std::tuple<torch::Tensor, torch::Tensor>
random_walk_cuda(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
int64_t walk_length, double p, double q) {
CHECK_CUDA(rowptr);
CHECK_CUDA(col);
CHECK_CUDA(start);
hipSetDevice(rowptr.get_device());
CHECK_INPUT(rowptr.dim() == 1);
CHECK_INPUT(col.dim() == 1);
CHECK_INPUT(start.dim() == 1);
auto n_out = torch::empty({walk_length + 1, start.size(0)}, start.options());
auto e_out = torch::empty({walk_length, start.size(0)}, start.options());
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (p == 1. && q == 1.) {
auto rand = torch::rand({start.size(0), walk_length},
start.options().dtype(torch::kFloat));
hipLaunchKernelGGL(( uniform_sampling_kernel), dim3(BLOCKS(start.numel())), dim3(THREADS), 0, stream,
rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
start.data_ptr<int64_t>(), rand.data_ptr<float>(),
n_out.data_ptr<int64_t>(), e_out.data_ptr<int64_t>(), walk_length,
start.numel());
} else {
hipLaunchKernelGGL(( rejection_sampling_kernel), dim3(BLOCKS(start.numel())), dim3(THREADS), 0, stream,
time(NULL), rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
start.data_ptr<int64_t>(), n_out.data_ptr<int64_t>(),
e_out.data_ptr<int64_t>(), walk_length, start.numel(), p, q);
}
return std::make_tuple(n_out.t().contiguous(), e_out.t().contiguous());
}
@@ -3,12 +3,12 @@
#include "cpu/knn_cpu.h"
#ifdef WITH_HIP
#include "hip/knn_hip.h"
#ifdef WITH_CUDA
#include "cuda/knn_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__knn_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__knn_cpu(void) { return NULL; }
@@ -20,7 +20,7 @@ torch::Tensor knn(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_y, int64_t k, bool cosine,
int64_t num_workers) {
if (x.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return knn_cuda(x, y, ptr_x, ptr_y, k, cosine);
#else
AT_ERROR("Not compiled with CUDA support");
......
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_HIP
#include "hip/nearest_hip.h"
#ifdef WITH_CUDA
#include "cuda/nearest_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__nearest_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__nearest_cpu(void) { return NULL; }
@@ -16,7 +16,7 @@ PyMODINIT_FUNC PyInit__nearest_cpu(void) { return NULL; }
torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y) {
if (x.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return nearest_cuda(x, y, ptr_x, ptr_y);
#else
AT_ERROR("Not compiled with CUDA support");
......
@@ -3,12 +3,12 @@
#include "cpu/radius_cpu.h"
#ifdef WITH_HIP
#include "hip/radius_hip.h"
#ifdef WITH_CUDA
#include "cuda/radius_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__radius_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__radius_cpu(void) { return NULL; }
@@ -20,7 +20,7 @@ torch::Tensor radius(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_y, double r,
int64_t max_num_neighbors, int64_t num_workers) {
if (x.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return radius_cuda(x, y, ptr_x, ptr_y, r, max_num_neighbors);
#else
AT_ERROR("Not compiled with CUDA support");
......
@@ -3,12 +3,12 @@
#include "cpu/rw_cpu.h"
#ifdef WITH_HIP
#include "hip/rw_hip.h"
#ifdef WITH_CUDA
#include "cuda/rw_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__rw_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__rw_cpu(void) { return NULL; }
@@ -19,7 +19,7 @@ std::tuple<torch::Tensor, torch::Tensor>
random_walk(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
int64_t walk_length, double p, double q) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return random_walk_cuda(rowptr, col, start, walk_length, p, q);
#else
AT_ERROR("Not compiled with CUDA support");
......
@@ -4,7 +4,7 @@
#include "cpu/sampler_cpu.h"
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__sampler_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__sampler_cpu(void) { return NULL; }
@@ -14,7 +14,7 @@ PyMODINIT_FUNC PyInit__sampler_cpu(void) { return NULL; }
torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
@@ -14,8 +14,8 @@ PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
#endif
int64_t cuda_version() {
#ifdef WITH_HIP
return TORCH_HIP_VERSION;
#ifdef WITH_CUDA
return CUDA_VERSION;
#else
return -1;
#endif
......
[metadata]
long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
Development Status :: 5 - Production/Stable
License :: OSI Approved :: MIT License
Programming Language :: Python
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3 :: Only
long_description=file: README.md
long_description_content_type=text/markdown
[aliases]
test = pytest
[tool:pytest]
addopts = --capture=no
[egg_info]
tag_build =
tag_date = 0