diag_cuda.cu 1.81 KB
Newer Older
rusty1s's avatar
matmul  
rusty1s committed
1
2
#include "diag_cuda.h"

rusty1s's avatar
rusty1s committed
3
4
#include <ATen/cuda/CUDAContext.h>

rusty1s's avatar
matmul  
rusty1s committed
5
#include "utils.cuh"
rusty1s's avatar
rusty1s committed
6
7
8

// Threads per block used when launching the kernel in this file.
#define THREADS 1024

rusty1s's avatar
rusty1s committed
9
10
// Marks, for every non-diagonal input entry, one slot of `out_data` as true.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per input entry; threads
// with a global index >= numel do nothing. No shared memory is used.
//
// Parameters:
//   row_data, col_data  row / column index of each entry (numel values each).
//   out_data            output mask; the host wrapper in this file allocates
//                       it zero-filled with numel + num_diag elements. Only
//                       `true` is ever written, never `false`.
//   N                   upper bound (exclusive) used for the diagonal column.
//   k                   diagonal offset: the entry (r, c) is treated as lying
//                       on the diagonal iff r + k == c.
//   num_diag            shift applied to entries whose row has no diagonal
//                       element because r + k >= N.
//   numel               number of input entries.
//
// For entry i with indices (r, c) and d = r + k, the slot set to true is:
//   d < 0    (only possible for k < 0)  -> i
//   d >= N                              -> i + num_diag
//   d > c                               -> i + (k < 0 ? d : r)
//   d < c                               -> i + (k < 0 ? d : r) + 1
//   d == c   (entry is on the diagonal) -> nothing is written
//
// NOTE(review): the shifts look like "number of diagonal slots that precede
// this entry", which presumes entries are sorted by row and that the caller
// passes a num_diag consistent with N and k -- confirm against the caller.
__global__ void non_diag_mask_kernel(const int64_t *__restrict__ row_data,
                                     const int64_t *__restrict__ col_data,
                                     bool *__restrict__ out_data,
                                     int64_t N, int64_t k, int64_t num_diag,
                                     int64_t numel) {

  // Widen before multiplying: blockDim.x * blockIdx.x is otherwise evaluated
  // in 32-bit unsigned arithmetic and wraps for grids of 2^32 or more threads.
  const int64_t thread_idx =
      static_cast<int64_t>(blockDim.x) * static_cast<int64_t>(blockIdx.x) +
      static_cast<int64_t>(threadIdx.x);

  if (thread_idx >= numel)
    return;

  const int64_t r = row_data[thread_idx];
  const int64_t c = col_data[thread_idx];
  const int64_t d = r + k; // column of the diagonal element in row r

  if (k < 0) {
    if (d < 0) {
      out_data[thread_idx] = true;
    } else if (d >= N) {
      out_data[thread_idx + num_diag] = true;
    } else if (d > c) {
      out_data[thread_idx + d] = true;
    } else if (d < c) {
      out_data[thread_idx + d + 1] = true;
    }
  } else {
    if (d >= N) {
      out_data[thread_idx + num_diag] = true;
    } else if (d > c) {
      out_data[thread_idx + r] = true;
    } else if (d < c) {
      out_data[thread_idx + r + 1] = true;
    }
  }
}

rusty1s's avatar
rusty1s committed
42
43
// Builds a boolean mask of length E + num_diag on `row`'s device, where
// E = row.size(0) and num_diag = max(0, k < 0 ? min(M + k, N) : min(M, N - k))
// (the length of the k-th diagonal of an M x N matrix, or 0 if that diagonal
// lies outside the matrix).
//
// The mask starts out all-false; non_diag_mask_kernel then sets one element
// to true for every input entry (row[i], col[i]) with row[i] + k != col[i].
//
// Parameters:
//   row, col  CUDA int64 tensors holding the row / column index of each entry;
//             both must live on the same device and `col` must provide at
//             least E entries.
//             NOTE(review): presumably 1-D and sorted by row -- confirm with
//             the callers.
//   M, N      matrix dimensions used to derive num_diag; N is also forwarded
//             to the kernel.
//   k         diagonal offset.
//
// Returns the mask tensor (dtype bool). The kernel is enqueued on the current
// CUDA stream and is not synchronized with the host before returning.
// Raises a c10::Error on invalid inputs or on a CUDA API / launch failure.
torch::Tensor non_diag_mask_cuda(torch::Tensor row, torch::Tensor col,
                                 int64_t M, int64_t N, int64_t k) {
  CHECK_CUDA(row);
  CHECK_CUDA(col);
  TORCH_CHECK(row.get_device() == col.get_device(),
              "row and col must be on the same CUDA device");
  AT_CUDA_CHECK(cudaSetDevice(row.get_device()));

  // The kernel indexes the raw storage linearly, so strided views must be
  // materialized first (no-op for tensors that are already contiguous).
  row = row.contiguous();
  col = col.contiguous();

  auto E = row.size(0);
  TORCH_CHECK(col.numel() >= E, "col has fewer entries than row");

  // Clamp to zero: for k <= -M or k >= N the requested diagonal lies outside
  // the matrix and the unclamped value would be non-positive, which makes the
  // mask too short (or its size negative) and the kernel write out of bounds.
  auto num_diag = std::max<int64_t>(
      k < 0 ? std::min(M + k, N) : std::min(M, N - k), 0);

  auto row_data = row.data_ptr<int64_t>();
  auto col_data = col.data_ptr<int64_t>();

  auto mask = torch::zeros({E + num_diag}, row.options().dtype(torch::kBool));
  auto mask_data = mask.data_ptr<bool>();

  // Nothing to mark; also avoids launching a kernel with a zero-sized grid.
  if (E == 0)
    return mask;

  auto stream = at::cuda::getCurrentCUDAStream();
  non_diag_mask_kernel<<<(E + THREADS - 1) / THREADS, THREADS, 0, stream>>>(
      row_data, col_data, mask_data, N, k, num_diag, E);
  // Kernel launches do not return a status; surface bad launch configurations
  // here instead of at some unrelated later CUDA call.
  AT_CUDA_CHECK(cudaGetLastError());

  return mask;
}