// multi_tensor_l2norm_kernel.cu
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "type_shim.h"
#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4
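// BLOCK_SIZE threads per block; each thread accumulates ILP elements per loop
// iteration (instruction-level parallelism) before the block-wide reduction.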

template<typename x_t>
struct L2NormFunctor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<1>& tl,
    float* output,
    float* output_per_tensor,
    bool per_tensor,
    int max_chunks_per_tensor)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

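    // multi_tensor_apply gives each block one (tensor, chunk) pair; look it up in
    // the metadata and offset the raw pointer to the start of this block's chunk.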
    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
    for(int i = 0; i < ILP; i++)
      vals[i] = 0.f;

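    // Strided loop over this chunk: each thread keeps ILP running sums of squares
    // in registers so several loads can be in flight per thread.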
    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
    {
      #pragma unroll
      for(int ii = 0; ii < ILP; ii++)
      {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size)
        {
          float next = static_cast<float>(x[i]);
          vals[ii] += next*next;
        }
      }
    }

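    // Fold the ILP per-thread accumulators into a single per-thread value.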
    float val = 0.f;
    for(int i = 0; i < ILP; i++)
        val += vals[i];

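    // Block-wide reduction of the per-thread sums; only thread 0's result is used below.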
    float final = reduce_block_into_lanes(s_vals, val);

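    // Thread 0 publishes this chunk's partial sum: accumulated into the shared output
    // buffer for the global norm and, optionally, stored per chunk for per-tensor norms.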
    if(threadIdx.x == 0)
    {
      if(!isfinite(final))
        *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] += final;
      if(per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

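// Second, small kernel: block 0 reduces the partial sums into the global L2 norm;
// if per_tensor is set, each block also reduces one tensor's chunk sums into that
// tensor's norm.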
__global__ void cleanup(
  float* output,
  float* output_per_tensor,
  float* ret,
  float* ret_per_tensor,
  bool per_tensor,
  int max_chunks_per_tensor)
{
  __shared__ float vals[512];

  if(blockIdx.x == 0)
  {
    float val = 0;
    if(threadIdx.x < 320)
      val = output[threadIdx.x];

    float final = reduce_block_into_lanes(vals, val);

    if(threadIdx.x == 0)
      *ret = sqrt(final);
  }

  if(per_tensor)
  {
    float* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor;

    float val = 0;
    for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
      val += output_this_tensor[i];

    float final = reduce_block_into_lanes(vals, val);

    if(threadIdx.x == 0)
      ret_per_tensor[blockIdx.x] = sqrt(final);
  }
}

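// Host entry point: returns a 1-element tensor with the L2 norm over all tensors in
// tensor_lists[0] and, when per_tensor is requested, a tensor of per-tensor norms.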
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python)
{
  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
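  // 320 partial-sum slots, one per block of a multi_tensor_apply launch wave
  // (presumably matching the harness's max blocks per launch).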
  auto output = at::zeros({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

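  // For per-tensor norms, allocate one slot per (tensor, chunk); the largest tensor
  // sets the row stride max_chunks_per_tensor.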
  if(per_tensor)
  {
    for(int t = 0; t < ntensors; t++)
    {
      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
      if(max_chunks_this_tensor > max_chunks_per_tensor)
        max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  }
  else
  {
    ret_per_tensor = at::empty({0}, float_options);
  }

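  // Dispatch on the input dtype (float/half); scalar_t_0 is supplied by the
  // DISPATCH_FLOAT_AND_HALF macro.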
  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
    multi_tensor_apply<1>(
      BLOCK_SIZE,
      chunk_size,
      noop_flag,
      tensor_lists,
      L2NormFunctor<scalar_t_0>(),
      output.data<float>(),
      per_tensor ? output_per_tensor.data<float>() : nullptr,
      per_tensor,
      max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launch, but it will be negligible end to end.
  // I could get rid of it by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now.
  auto ret = at::empty({1}, output.options());
  auto stream = at::cuda::getCurrentCUDAStream();
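  // One block per tensor when per-tensor norms are requested, otherwise a single block;
  // 512 threads either way, matching the shared-memory buffers in the kernels.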
  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
    output.data<float>(),
    per_tensor ? output_per_tensor.data<float>() : nullptr,
    ret.data<float>(),
    per_tensor ? ret_per_tensor.data<float>() : nullptr,
    per_tensor,
    max_chunks_per_tensor);

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}