#include "kernel_utils.cuh" #include #ifdef VERSION_LE_04 #include "ATen/cuda/AccumulateType.cuh" #else #include "ATen/AccumulateType.h" #endif #include "ATen/cuda/CUDATensorMethods.cuh" // #include "ATen/cuda/CUDATypeConversion.cuh" // #include template __global__ void weight_norm_fwd_first_dim_kernel (scalar_t* __restrict__ w, accscalar_t* __restrict__ norms, const scalar_t* __restrict__ v, const scalar_t* __restrict__ g, const int rowSize) { // We are norming each slowest-dim row of the tensor separately. // For now, assign one block to each row. const int tid = threadIdx.x; const int row = blockIdx.x; const int stride = blockDim.x; // Logical index offset for this flattened row const int rowStart = row*rowSize; // Hack to get around nvcc complaining when an smem array is declared with the same name // but different types in different kernels (in this case different instantiations) // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" extern __shared__ char buf[]; accscalar_t* s = (accscalar_t*)buf; accscalar_t thread_sum = 0.f; for(int i = tid; i < rowSize; i += stride ) { accscalar_t val_f = scalar_cast(v[i+rowStart]); thread_sum += val_f*val_f; // AccumOp, could do Kahan here } reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); accscalar_t result = s[0]; result = sqrtf(result); if(tid == 0) norms[row] = result; // Broadcast load, could use shared memory instead. accscalar_t g_this_row = scalar_cast(g[row]); accscalar_t rnorm = 1.f/result; // for consistency with backward kernel // Write data to output for(int i = tid; i < rowSize; i += stride ) { accscalar_t val_f = scalar_cast(v[i+rowStart]); w[i+rowStart] = scalar_cast(g_this_row*val_f*rnorm); } } template __global__ void weight_norm_fwd_last_dim_kernel ( scalar_t* __restrict__ w, accscalar_t* __restrict__ norms, const scalar_t* __restrict__ v, const scalar_t* __restrict__ g, const int fast_dim_size, const int slower_dims_size ) { const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; extern __shared__ char buf[]; accscalar_t* alloc = (accscalar_t*)buf; accscalar_t* s = &alloc[0]; accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y]; accscalar_t thread_sum = 0.f; int slower_dims_location = threadIdx.y; int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; if(fast_dim_location < fast_dim_size) while(slower_dims_location < slower_dims_size) { accscalar_t val_f = scalar_cast(v[currentIdx]); thread_sum += val_f*val_f; // AccumOp, could do Kahan here currentIdx += blockDim.y*fast_dim_size; slower_dims_location += blockDim.y; } reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); // Better to pass an EpilogueOp to reduce_block_into_lanes, implement later if(threadIdx.y == 0) { accscalar_t result = s[threadIdx.x]; accscalar_t norm_this_col = sqrtf(result); norms[fast_dim_location] = norm_this_col; rnorms_this_block[threadIdx.x] = 1.f/norm_this_col; } __syncthreads(); accscalar_t g_this_col = scalar_cast(g[fast_dim_location]); accscalar_t rnorm = rnorms_this_block[threadIdx.x]; slower_dims_location = threadIdx.y; currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; if(fast_dim_location < fast_dim_size) while(slower_dims_location < slower_dims_size) { accscalar_t val_f = scalar_cast(v[currentIdx]); w[currentIdx] = scalar_cast(g_this_col*val_f*rnorm); currentIdx += blockDim.y*fast_dim_size; slower_dims_location += blockDim.y; } } void weight_norm_fwd_cuda (const at::Tensor& w, const at::Tensor& norms, const at::Tensor& v, const 
at::Tensor& g, int dim) { #ifdef DEBUG_ANY using namespace std; cout << "hello from send_to_fwd with v.type() = " << v.type() << endl; #endif const int ndims = v.ndimension(); if(dim == 0) { // Find logical size of each flattened slowest-dim row int rowSize = 1; for(int i = ndims - 1; i > 0; i--) rowSize *= v.size(i); using namespace at; cudaStream_t stream = globalContext().getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF (v.type(), "weight_norm_fwd_first_dim_kernel", [&] { using cuda_scalar_t = apex::cuda::type; USING_ACCSCALAR_T weight_norm_fwd_first_dim_kernel <<>> (w.data(), norms.data(), v.data(), g.data(), rowSize); }); } else if(dim == ndims - 1) { // Precompute slower_dims_size and fast_dim_size int slower_dims_size = 1; for(int i = 0; i < ndims - 1; i++) slower_dims_size *= v.size(i); int fast_dim_size = v.size(ndims-1); using namespace at; cudaStream_t stream = globalContext().getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF (v.type(), "weight_norm_fwd_last_dim_kernel", [&] { using cuda_scalar_t = apex::cuda::type; USING_ACCSCALAR_T // just trying this formatting out to see how it feels... weight_norm_fwd_last_dim_kernel <<<(fast_dim_size+TILE_W-1)/TILE_W, dim3(TILE_W,TILE_H), (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), stream>>> (w.data(), norms.data(), v.data(), g.data(), fast_dim_size, slower_dims_size); }); } // else // { // intermediate dim kernel. Error checking on the dim was already done in // Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include // in both files. // } // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught // until a later error check on a synchronizing CUDA call. Unfortunately, without manually // synchronizing here, this is the best we can do. THCudaCheck(cudaGetLastError()); #ifdef DEBUG_PROFILE THCudaCheck(cudaDeviceSynchronize()); #endif }