"...git@developer.sourcefind.cn:OpenDAS/ktransformers.git" did not exist on "c6aa379de2251e64bab47a2704266abf150080a8"
Commit d17a015f authored by Michael Carilli's avatar Michael Carilli
Browse files

Transferred backend and build system to use PyTorch C++ extension + ATen dispatch.

parent e733e78c
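For orientation, a minimal sketch of the pattern the new backend adopts (illustrative only, not part of this commit): a host-side function takes at::Tensor arguments, AT_DISPATCH_FLOATING_TYPES_AND_HALF selects the scalar type, and PYBIND11_MODULE exposes the function to Python. The name my_scale and the CPU loop body are made up for the example; the real code below launches CUDA kernels instead.

#include <torch/torch.h>

// Hypothetical op: scale a CPU floating-point tensor in place.
// The dispatch macro instantiates the lambda for float, double, and half,
// defining scalar_t inside the lambda body.
void my_scale(at::Tensor t, float s) {
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(t.type(), "my_scale", [&] {
    scalar_t* p = t.data<scalar_t>();
    for (int64_t i = 0; i < t.numel(); ++i)
      p[i] = static_cast<scalar_t>(static_cast<float>(p[i]) * s);
  });
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("my_scale", &my_scale, "Illustrative in-place scale");
}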
from . import amp
from . import RNN
from . import reparameterization
from . import fp16_utils
from . import parallel
from . import amp
from torch.utils.ffi import _wrap_function
from ._scale_lib import lib as _lib, ffi as _ffi

__all__ = []

def _import_symbols(locals):
    for symbol in dir(_lib):
        fn = getattr(_lib, symbol)
        if callable(fn):
            locals[symbol] = _wrap_function(fn, _ffi)
        else:
            locals[symbol] = fn
        __all__.append(symbol)

_import_symbols(locals())
@@ -4,7 +4,7 @@ import warnings
import torch
from ._C import scale_lib
from apex._C import scale_check_overflow
class AmpHandle(object):
    def __init__(self, enable_caching=True):
@@ -33,9 +33,9 @@ class AmpHandle(object):
        for group in optimizer.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    scale_lib.scale_check_overflow(p.grad.data,
                                                   1. / self._loss_scale,
                                                   self._overflow_buf)
                    scale_check_overflow(p.grad.data,
                                         1. / self._loss_scale,
                                         self._overflow_buf)
        if self._overflow_buf.any():
            self._loss_scale /= 2.
optimizer_step = optimizer.step
......
#include <THC/THC.h>
#include "scale_kernel.h"
extern THCState *state;
void scale_check_overflow(THCudaTensor *grads,
float scale,
THCudaByteTensor *overflow_buf) {
size_t num_elems = THCudaTensor_nElement(state, grads);
float *d_grads = THCudaTensor_data(state, grads);
size_t buf_elems = THCudaByteTensor_nElement(state, overflow_buf);
uint8_t *d_overflow_buf = THCudaByteTensor_data(state, overflow_buf);
scale_check_overflow_kernel(state, d_grads, num_elems, scale,
d_overflow_buf, buf_elems);
}
void scale_check_overflow(THCudaTensor *grads,
float scale,
THCudaByteTensor *overflow_buf);
#ifndef SCALE_KERNEL_H
#define SCALE_KERNEL_H
#include <THC/THC.h>
#ifdef __cplusplus
extern "C" {
#endif
void scale_check_overflow_kernel(THCState *state,
float *d_grads, size_t n, float scale,
uint8_t *d_buf, size_t buf_n);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // SCALE_KERNEL_H
# This file contains the cffi-extension call to build the custom
# kernel used by amp.
# For mysterious reasons, it needs to live at the top-level directory.
# TODO: remove this when we move to cpp-extension.
import os
import torch
from torch.utils.ffi import create_extension
assert torch.cuda.is_available()
abs_path = os.path.dirname(os.path.realpath(__file__))
sources = ['apex/amp/src/scale_cuda.c']
headers = ['apex/amp/src/scale_cuda.h']
defines = [('WITH_CUDA', None)]
with_cuda = True
extra_objects = [os.path.join(abs_path, 'build/scale_kernel.o')]
# When running `python build_cffi.py` directly, set package=False. But
# if it's used with `cffi_modules` in setup.py, then set package=True.
package = (__name__ != '__main__')
extension = create_extension(
    'apex.amp._C.scale_lib',
    package=package,
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_objects=extra_objects
)

if __name__ == '__main__':
    extension.build()
#define PY_SSIZE_T_CLEAN
#define ARG_OFFSET 5
#include <Python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <string>
#include <cmath>
#include <cassert>
#include <iostream>
// #define USE_NVTX
#ifdef USE_NVTX
#include "nvToolsExt.h"
#endif
//Meta-data format we will use
#include <THCTensorInfo.cuh>
//Cuda kernels
#include <kernel.h>
#define ERROR_MSG cout << "Error at " << __FILE__ << ":" << __LINE__ << "\n";
using namespace std;
TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
{
PyObject* PyStrides = PyObject_CallMethod(tensor, "stride", NULL);
if(PyStrides == NULL)
{
ERROR_MSG;
cout << "PyStrides = NULL" << endl;
}
PyObject* PySizes = PyObject_CallMethod(tensor, "size", NULL);
if(PySizes == NULL)
{
ERROR_MSG;
cout << "PySizes = NULL" << endl;
}
PyObject* PyDataPtr = PyObject_CallMethod(tensor, "data_ptr", NULL);
if(PyDataPtr == NULL)
{
ERROR_MSG;
cout << "PyDataPtr = NULL" << endl;
}
void* data_ptr = (void*) PyLong_AsLong(PyDataPtr);
Py_ssize_t ndims = PyList_GET_SIZE(PySizes);
// TODO put proper checking on ndims < MAX_CUTORCH_DIMS
idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS];
for(int i = 0; i < ndims; i++)
{
strides[i] = PyLong_AsLong(PyTuple_GetItem(PyStrides, i));
sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i));
}
Py_DECREF(PyStrides);
Py_DECREF(PySizes);
Py_DECREF(PyDataPtr);
return TensorInfo<void, idxType>(data_ptr, ndims, sizes, strides, data_type);
}
vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
{
vector<TensorInfo<void, idxType> > info_vec;
#ifdef DEBUG_ANY
cout << "Processing " << PyTuple_GET_SIZE(args) << " arguments" << endl;
#endif
#ifdef CHECK_MEMLEAK
for(int iter = 0; iter < 1e7; iter++ )
#endif
for(Py_ssize_t i = 0; i<PyTuple_GET_SIZE(args) - 1; i++)
{
PyObject* pyTensor = PyTuple_GetItem(args, i);
// check type, only take if Tensor, Variable, or Parameter
string objType(pyTensor->ob_type->tp_name);
PyObject* pyObjTypeCall = PyObject_CallMethod(pyTensor, "type", NULL);
if(pyObjTypeCall == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
}
PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
if(pyObjASCII == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
}
Py_DECREF(pyObjTypeCall);
string objTypeCall(PyBytes_AsString(pyObjASCII));
Py_DECREF(pyObjASCII);
#ifdef DEBUG_ANY
cout << "arg " << i << endl;
cout << "objType = " << objType << endl;
cout << "objTypeCall = " << objTypeCall << endl;
#endif
if(objTypeCall == "torch.cuda.FloatTensor")
#ifdef CHECK_MEMLEAK
if(iter == 0 )
#endif
info_vec.push_back(PyOb_2_tinfo(pyTensor, FLOAT));
#ifdef CHECK_MEMLEAK
else
info_vec[i] = PyOb_2_tinfo(pyTensor, FLOAT);
#endif
else if(objTypeCall == "torch.cuda.HalfTensor")
info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
// TODO add double
else
{
ERROR_MSG;
cout << "For args item " << i << ", unsupported .type() found: "
<< objTypeCall << "\n"
"Supported types:\n"
"torch.cuda.FloatTensor\n"
"torch.cuda.HalfTensor\n"
"torch.autograd.variable.Variable containing FloatTensor\n"
"torch.autograd.variable.Variable containing HalfTensor\n"
"torch.nn.parameter.Parameter containing FloatTensor\n"
"torch.nn.parameter.Parameter containing HalfTensor\n"
<< endl;
}
}
return info_vec;
}
int getLastArg_AsInt(PyObject* args)
{
// None of these should return new references so I don't think this leaks memory.
int dims = PyLong_AsLong(PyTuple_GetItem(args, PyTuple_GET_SIZE(args) - 1));
return dims;
}
// Stepping stone, can evolve to be more general (argument forwarding?)
template<typename wrapper>
void dispatch
(
float_types rtti,
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
switch(rtti)
{
case FLOAT:
wrapper::template call<float, float, idxType>(tensors, dim);
break;
case HALF:
wrapper::template call<half, float, idxType>(tensors, dim);
break;
default:
std::cout << "Unsupported rtti in Module.cpp:dispatch()" << std::endl;
PyErr_SetString(PyExc_RuntimeError, "Unsupported data type in Module.cpp:dispatch, "
"supported data types are half and float");
exit(-1);
}
}
// Will extract all tensors in order. Assumes flat structure, tensors can not be wrapped in lists
// tuples or any other iterator structure.
static PyObject* weight_norm_fwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_fwd C backend");
#endif
vector<TensorInfo<void, idxType> > tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[2].dims - 1)
PyErr_SetString(PyExc_RuntimeError, "weight_norm_fwd currently only "
"supports first or last dimension.");
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_fwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
static PyObject* weight_norm_bwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_bwd C backend");
#endif
vector<TensorInfo<void, idxType> >tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[3].dims - 1)
PyErr_SetString(PyExc_RuntimeError, "weight_norm_bwd currently only "
"supports first or last dimension.");
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_bwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
//*******************PYTHON BOILER PLATE*******************
static PyMethodDef apex_methods[] = {
{"weight_norm_fwd", (PyCFunction) weight_norm_fwd, METH_VARARGS, "Slowest-dim norm, forward pass."},
{"weight_norm_bwd", (PyCFunction) weight_norm_bwd, METH_VARARGS, "Slowest-dim norm, backward pass."},
{NULL, NULL, 0, NULL}
};
#if PY_MAJOR_VERSION >= 3
//Module Definitions
static struct PyModuleDef apex = {
PyModuleDef_HEAD_INIT, "apex._C", "Module to add CUDA extensions to Pytorch.", -1, apex_methods
};
//Initialization Function
PyMODINIT_FUNC PyInit__C(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
Py_Initialize();
return PyModule_Create(&apex);
}
#else
PyMODINIT_FUNC initMODULE(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
(void) Py_InitModule3("apex._C", apex, "A PyTorch Extension.");
}
#endif
//*********************************************************
#include <torch/torch.h>
// Ideally, I'd like to call this file "weight_norm.cu" and put the interface and the implementation
// here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
// and separate .cu implementation files.
// If we want everything to be part of "apex._C", we need all the interface functions defined
// in this file, or linker will complain about "multiple definitions of PyInit".
// TODO: multiple modules?
// TODO: modify fwd+bwd calls to return a tuple of Tensors. This will require changing the
// Python client code as well. For now, get things working with the same Python-side API.
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim);
void weight_norm_fwd
(at::Tensor w,
at::Tensor norms,
at::Tensor v,
at::Tensor g,
int dim)
{
weight_norm_fwd_cuda(w, norms, v, g, dim);
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim);
void weight_norm_bwd
(at::Tensor pLpv,
at::Tensor pLpg,
at::Tensor pLpw,
at::Tensor savedv,
at::Tensor savedg,
at::Tensor savedNorms,
int dim)
{
weight_norm_bwd_cuda(pLpv, pLpg, pLpw, savedv, savedg, savedNorms, dim);
}
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf);
void scale_check_overflow
(at::Tensor grads,
float scale,
at::Tensor overflow_buf)
{
AT_CHECK(grads.type().is_cuda(), "x must be a CUDA tensor");
AT_CHECK(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
scale_check_overflow_cuda(grads, scale, overflow_buf);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("weight_norm_fwd", &weight_norm_fwd, "Fused weight norm, forward pass");
m.def("weight_norm_bwd", &weight_norm_bwd, "Fused weight norm, backward pass");
m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
}
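Regarding the TODO above about returning a tuple of Tensors: one possible shape for that interface (an illustrative sketch only, keeping today's caller-allocated outputs) would be

#include <tuple>  // std::tuple; pybind11 converts it to a Python tuple automatically

// Hypothetical variant of weight_norm_fwd for the TODO above: forward to the same
// CUDA implementation, but hand the outputs back so the Python side could write
// "w, norms = weight_norm_fwd(...)". It would be registered with m.def like the
// functions above; weight_norm_fwd_tuple is a made-up name for this sketch.
std::tuple<at::Tensor, at::Tensor> weight_norm_fwd_tuple
  (at::Tensor w,
   at::Tensor norms,
   at::Tensor v,
   at::Tensor g,
   int dim)
{
  weight_norm_fwd_cuda(w, norms, v, g, dim);
  return std::make_tuple(w, norms);
}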
#include "../include/kernel.h"
template<typename T> struct TtoInt { static const int test = -1; };
template<> struct TtoInt<float> { static const int test = 0; };
template<> struct TtoInt<half> { static const int test = 0; };
template<> struct TtoInt<double> { static const int test = 0; };
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say 1024, can improve performance by
// increasing the amount of cache available per block, which can improve cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
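// Worked example of the resulting launch shapes (illustrative numbers, added for clarity,
// not part of the original file): for a last-dim norm with fast_dim_size = 1024 and
// slower_dims_size = 4096, the launch used further down becomes
//     grid  = (1024 + TILE_W - 1) / TILE_W            = 64 blocks,
//     block = dim3(TILE_W, TILE_H)                    = 16 x 64 = 1024 threads,
//     smem  = (TILE_W*TILE_H + TILE_W)*sizeof(float)  = 4160 bytes per block,
// while a first-dim norm over v of shape (N, ...) launches N blocks of BLOCK = 256
// threads with BLOCK*sizeof(float) = 1024 bytes of shared memory each.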
using namespace std;
// lanes is intended to be <= 32.
template <typename T>
__device__ __forceinline__ void reduce_block_into_lanes(T *x, T val, int lanes)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] += x[tid+i]; // JoinOp
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = x[tid] + x[tid+32]; // JoinOp
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final += __SHFL_DOWN(final, i);
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
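// Usage note (added comment, not in the original file): after this function returns,
// x[0..lanes-1] hold the reduction of all blockSize partial values. The first-dim
// kernels below call it with lanes = 1 and read s[0]; the last-dim kernels call it
// with lanes = blockDim.x and read s[threadIdx.x].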
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_first_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType rowSize
)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
result = sqrtf(result);
if(tid == 0)
DEVICE_LINEAR_GET_F(norms, row) = result;
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, row));
float rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
DEVICE_LINEAR_GET(w, i + rowStart) = ScalarConvert<float,T>::to(g_this_row*val_f*rnorm);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_last_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float alloc[];
float* s = &alloc[0];
float* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
float result = s[threadIdx.x];
float norm_this_col = sqrtf(result);
DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
__syncthreads();
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, fast_dim_location));
float rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
DEVICE_LINEAR_GET(w, currentIdx) = ScalarConvert<float,T>::to(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_first_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType rowSize
)
{
// For now, assign one block to each row.
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, i + rowStart));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, i + rowStart));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, row);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
DEVICE_LINEAR_GET(pLpg, row) = ScalarConvert<float, T>::to(result*rnorm);
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, row));
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(IndexType j = tid; j < rowSize; j += stride )
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, j + rowStart));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, j + rowStart));
float pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, j + rowStart) = ScalarConvert<float,T>::to(pLpvj);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_last_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float s[];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
float result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, fast_dim_location);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
DEVICE_LINEAR_GET(pLpg, fast_dim_location) = ScalarConvert<float, T>::to(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, fast_dim_location));
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
float pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, currentIdx) = ScalarConvert<float,T>::to(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_fwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
#endif
auto w (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto norms(*((TensorInfo<AccumType, idxType>*)&tensors[1]));
auto v (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto g (*((TensorInfo<DataType , idxType>*)&tensors[3]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = v.dims - 1; i > 0; i--)
rowSize *= v.sizes[i];
weight_norm_fwd_first_dim_kernel<<<v.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
w,
norms,
v,
g,
rowSize
);
}
else if(dim == v.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < v.dims - 1; i++)
slower_dims_size *= v.sizes[i];
int fast_dim_size = v.sizes[v.dims-1];
weight_norm_fwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
w,
norms,
v,
g,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_bwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1]));
auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto savedv (*((TensorInfo<DataType , idxType>*)&tensors[3]));
auto savedg (*((TensorInfo<DataType , idxType>*)&tensors[4]));
auto savedNorms(*((TensorInfo<AccumType, idxType>*)&tensors[5]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = savedv.dims - 1; i > 0; i--)
rowSize *= savedv.sizes[i];
weight_norm_bwd_first_dim_kernel<<<pLpw.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
rowSize
);
}
else if(dim == savedv.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < savedv.dims - 1; i++)
slower_dims_size *= savedv.sizes[i];
int fast_dim_size = savedv.sizes[savedv.dims-1];
weight_norm_bwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
#define INSTANTIATE_SEND_TO_FWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_fwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_FWD(float, float, idxType)
INSTANTIATE_SEND_TO_FWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_FWD
#define INSTANTIATE_SEND_TO_BWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_bwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_BWD(float, float, idxType)
INSTANTIATE_SEND_TO_BWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_BWD
#undef BLOCK
#undef TILE_W
#undef TILE_H
#include <cuda.h>
#include <cuda_runtime.h>
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say 1024, can improve performance by
// increasing the amount of cache available per block, which can improve cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
// For reference, in THCTensorMathReduce.cuh:
// template <typename T>
// struct ReduceAdd {
// inline __device__ T operator()(const T a, const T b) const {
// return THCNumerics<T>::add(a, b);
// }
// };
// lanes is intended to be <= 32.
template
<typename T,
typename ReduceOp>
__device__ __forceinline__ void reduce_block_into_lanes
(T *x,
T val,
int lanes,
ReduceOp reduceOp)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = reduceOp(x[tid], x[tid+i]);
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = reduceOp(x[tid], x[tid+32]);
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = reduceOp(final, __SHFL_DOWN(final, i));
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
#include "scale_kernel.h"
#include <ATen/ATen.h>
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
#include <assert.h>
#define BLOCK_SIZE 1024
#define MAX_BLOCKS 1024
#ifdef __cplusplus
extern "C" {
#endif
__global__
void scale_reduce_overflow(float *in, size_t n, float scale,
uint8_t *overflow_out) {
// It makes sense to lock the type to "float" here because the downscaling
// should only be applied to the FP32 master gradients. Also, if "in" were
// a different type, it would require divergent code for the vectorized load logic.
__global__ void scale_reduce_overflow
(float *in,
size_t n,
float scale,
uint8_t *overflow_out)
{
__shared__ uint8_t cta_overflow[BLOCK_SIZE];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -71,17 +79,25 @@ void scale_reduce_overflow(float *in, size_t n, float scale,
}
}
void scale_check_overflow_kernel(THCState *state,
float *d_grads, size_t n, float scale,
uint8_t *d_buf, size_t buf_n) {
int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
MAX_BLOCKS);
assert(buf_n >= num_blks);
cudaStream_t cur_stream = THCState_getCurrentStream(state);
scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, cur_stream>>>(
d_grads, n, scale, d_buf);
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf)
{
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
size_t n = d_grads.numel();
size_t buf_n = d_buf.numel();
int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
MAX_BLOCKS);
assert(buf_n >= num_blks);
scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
(d_grads.data<float>(),
n,
scale,
d_buf.data<uint8_t>());
THCudaCheck(cudaGetLastError());
}
#ifdef __cplusplus
} // extern "C"
#endif
\ No newline at end of file
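A worked example of the launch arithmetic in scale_check_overflow_cuda above (illustrative numbers, not from the commit): with BLOCK_SIZE = 1024 and MAX_BLOCKS = 1024, a gradient tensor of n = 2^22 elements yields (n + 1023) / 1024 = 4096 candidate blocks, clamped to num_blks = 1024, so the assert requires overflow_buf to hold at least 1024 entries (presumably one flag per block); the Python handle then reduces that buffer with overflow_buf.any().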
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_first_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int rowSize)
{
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[i+rowStart]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[i+rowStart]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[row];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
pLpg[row] = scalar_cast<scalar_t>(result*rnorm);
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(savedg[row]);
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(int j = tid; j < rowSize; j += stride )
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[j+rowStart]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[j+rowStart]);
accscalar_t pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[j+rowStart] = scalar_cast<scalar_t>(pLpvj);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_last_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int fast_dim_size,
const int slower_dims_size)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[currentIdx]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[fast_dim_location];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
pLpg[fast_dim_location] = scalar_cast<scalar_t>(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
accscalar_t g_this_col = scalar_cast<accscalar_t>(savedg[fast_dim_location]);
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[currentIdx]);
accscalar_t pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[currentIdx] = scalar_cast<scalar_t>(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
const int ndims = savedv.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= savedv.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= savedv.size(i);
int fast_dim_size = savedv.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_first_dim_kernel
(scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int rowSize)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
result = sqrtf(result);
if(tid == 0)
norms[row] = result;
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(g[row]);
accscalar_t rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
w[i+rowStart] = scalar_cast<scalar_t>(g_this_row*val_f*rnorm);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_last_dim_kernel
(
scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int fast_dim_size,
const int slower_dims_size
)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* alloc = (accscalar_t*)buf;
accscalar_t* s = &alloc[0];
accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
accscalar_t result = s[threadIdx.x];
accscalar_t norm_this_col = sqrtf(result);
norms[fast_dim_location] = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
__syncthreads();
accscalar_t g_this_col = scalar_cast<accscalar_t>(g[fast_dim_location]);
accscalar_t rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
w[currentIdx] = scalar_cast<scalar_t>(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
#endif
const int ndims = v.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= v.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
weight_norm_fwd_first_dim_kernel
<<<v.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= v.size(i);
int fast_dim_size = v.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
// just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
#ifndef THC_TENSOR_INFO_INC
#define THC_TENSOR_INFO_INC
#include <cuda.h>
#include <cuda_fp16.h>
#include <assert.h>
// Maximum number of dimensions allowed for cutorch
#define MAX_CUTORCH_DIMS 10
// Warning string for tensor arguments that are too large or have too many dimensions
#define CUTORCH_STR(X) #X
#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
enum float_types { FLOAT = 0 , HALF = 1, DOUBLE = 2 };
// CUDA kernel argument that defines tensor layout
template <typename T, typename IndexType>
struct TensorInfo {
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]);
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types type);
//Good way to cast from another format
//template <TensorInfo<typename T2, typename I2> >
//TensorInfo(TensorInfo<T2, I2> &tinfo_in){
// data = reinterpret_cast<T*>(tinfo_in.data);
//}
T* data;
IndexType sizes[MAX_CUTORCH_DIMS];
IndexType strides[MAX_CUTORCH_DIMS];
int dims;
float_types type;
};
//Expand our combinations as convenient typedefs
typedef TensorInfo<half, int> t_hi;
typedef TensorInfo<half, long> t_hl;
typedef TensorInfo<float, int> t_fi;
typedef TensorInfo<float, long> t_fl;
template <typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]) {
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
}
template<typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types _type){
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
type=_type;
}
// Translate a linear index for the apply to a T* offset;
// specialized on `Dims` to reduce nvcc compilation time
template <typename T, typename IndexType, int Dims>
struct IndexToOffset {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use static dims
for (int i = Dims - 1; i > 0; --i) {
offset += linearId % info.sizes[i] * info.strides[i];
linearId /= info.sizes[i];
}
offset += linearId * info.strides[0];
return offset;
}
};
// For contiguous tensors, the offset = index
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -2> {
static __forceinline__ __host__ __device__ IndexType
get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
return linearId;
}
};
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use dynamic dims
for (int i = info.dims - 1; i >= 0; --i) {
IndexType curDimIndex = linearId % info.sizes[i];
IndexType curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset;
linearId /= info.sizes[i];
}
return offset;
}
};
#endif // THC_TENSOR_INFO_INC
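To illustrate the dynamic-dims path of IndexToOffset above (an added example, not part of the original header): for a non-contiguous view with sizes = {3, 2} and strides = {1, 3} (the transpose of a contiguous 2x3 tensor), linearId = 4 corresponds to logical coordinates (2, 0); the loop computes 4 % 2 = 0 (offset += 0*3), linearId becomes 2, then 2 % 3 = 2 (offset += 2*1), giving offset 2, which is where element (2, 0) of the transposed view lives in storage. For a contiguous tensor the same loop reproduces linearId itself, which is why the Dims == -2 specialization simply returns linearId.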
#include "THCTensorInfo.cuh"
#include <iostream>
#include <cstdio>
#include <cassert>
#include <cuda.h>
#include <vector>
#define Dims -2
#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
typedef int idxType;
struct send_to_fwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
struct send_to_bwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
template <typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
#ifdef CUDA_HALF_TENSOR
template <typename Out>
struct ScalarConvert<half, Out> {
static __host__ __device__ __forceinline__ Out to(const half v) {
#ifdef __CUDA_ARCH__
return (Out) __half2float(v);
#else
return (Out) THC_half2float(v);
#endif
}
};
template <typename In>
struct ScalarConvert<In, half> {
static __host__ __device__ __forceinline__ half to(const In v) {
#ifdef __CUDA_ARCH__
return __float2half((float) v);
#else
return THC_float2half((float) v);
#endif
}
};
template <>
struct ScalarConvert<half, half> {
static __host__ __device__ __forceinline__ half to(const half v) {
return v;
}
};
#endif
import re
import sys
import os
import shutil
import inspect
import distutils
import distutils.spawn
import torch.cuda
from setuptools import setup, find_packages
from distutils.command.clean import clean
from torch.utils.cpp_extension import CppExtension, CUDAExtension
from torch.utils.cpp_extension import CUDA_HOME
# TODO: multiple modules, so we don't have to route all interfaces through
# the same interface.cpp file?
if torch.cuda.is_available() and CUDA_HOME is not None:
    ext_modules = []
    extension = CUDAExtension(
        'apex._C', [
            'csrc/interface.cpp',
            'csrc/weight_norm_fwd_cuda.cu',
            'csrc/weight_norm_bwd_cuda.cu',
            'csrc/scale_cuda.cu',
        ],
        extra_compile_args={'cxx': ['-g'],
                            'nvcc': ['-O2', '-arch=sm_70']})  # TODO: compile for all arches.
    ext_modules.append(extension)
else:
    raise RuntimeError("Apex requires Cuda 9.0 or higher")
from setuptools import setup, Extension, find_packages
from setuptools.command.install import install
import subprocess
import ctypes.util
import torch
def find(path, regex_func, collect=False):
    """
    Recursively searches through a directory with regex_func and
    either collects all instances or returns the first instance.

    Args:
        path: Directory to search through
        regex_function: A function to run on each file to decide if it should be returned/collected
        collect (False) : If True will collect all instances of matching, else will return first instance only
    """
    collection = [] if collect else None
    for root, dirs, files in os.walk(path):
        for file in files:
            if regex_func(file):
                if collect:
                    collection.append(os.path.join(root, file))
                else:
                    return os.path.join(root, file)
    return list(set(collection))
def findcuda():
"""
Based on PyTorch build process. Will look for nvcc for compilation.
Either will set cuda home by environment variable CUDA_HOME or will search
for nvcc. Returns NVCC executable, cuda major version and cuda home directory.
"""
cuda_path = None
CUDA_HOME = None
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
# We use nvcc path on Linux and cudart path on macOS
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
cuda_path = os.path.dirname(cudart_path)
if cuda_path is not None:
CUDA_HOME = os.path.dirname(cuda_path)
if not cuda_path and not CUDA_HOME:
nvcc_path = find('/usr/local/', re.compile("nvcc").search, False)
if nvcc_path:
CUDA_HOME = os.path.dirname(nvcc_path)
if CUDA_HOME:
os.path.dirname(CUDA_HOME)
if (not os.path.exists(CUDA_HOME+os.sep+"lib64")
or not os.path.exists(CUDA_HOME+os.sep+"include") ):
raise RuntimeError("Error: found NVCC at ", nvcc_path ," but could not locate CUDA libraries"+
" or include directories.")
raise RuntimeError("Error: Could not find cuda on this system."+
" Please set your CUDA_HOME enviornment variable to the CUDA base directory.")
NVCC = find(CUDA_HOME+os.sep+"bin",
re.compile('nvcc$').search)
print("Found NVCC = ", NVCC)
# Parse output of nvcc to get cuda major version
nvcc_output = subprocess.check_output([NVCC, '--version']).decode("utf-8")
CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
print("Found CUDA_LIB = ", CUDA_LIB)
if CUDA_LIB:
try:
CUDA_VERSION = int(CUDA_LIB.split('.')[0])
except (ValueError, TypeError):
CUDA_VERSION = 9
else:
CUDA_VERSION = 9
if CUDA_VERSION < 8:
raise RuntimeError("Error: APEx requires CUDA 8 or newer")
return NVCC, CUDA_VERSION, CUDA_HOME
#Get some important paths
curdir = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
buildir = curdir+os.sep+"build"
if not os.path.exists(buildir):
os.makedirs(buildir)
torch_dir = os.path.split(torch.__file__)[0] + os.sep + "lib"
cuda_files = find(curdir, lambda file: file.endswith(".cu"), True)
cuda_headers = find(curdir, lambda file: file.endswith(".cuh"), True)
headers = find(curdir, lambda file: file.endswith(".h"), True)
libaten = list(set(find(torch_dir, re.compile("libaten", re.IGNORECASE).search, True)))
libaten_names = [os.path.splitext(os.path.basename(entry))[0] for entry in libaten]
for i, entry in enumerate(libaten_names):
if entry[:3]=='lib':
libaten_names[i] = entry[3:]
aten_h = find(torch_dir, re.compile("aten.h", re.IGNORECASE).search, False)
torch_inc = os.path.dirname(os.path.dirname(aten_h))
include_dirs = [torch_inc]
library_dirs = []
for file in cuda_headers+headers:
dir = os.path.dirname(file)
if dir not in include_dirs:
include_dirs.append(dir)
# Object files that use the PyTorch cffi-extension interface
# They need special handling during compilation
cffi_objects = ['scale_kernel.o']
assert libaten, "Could not find PyTorch's libATen."
assert aten_h, "Could not find PyTorch's ATen header."
library_dirs.append(os.path.dirname(libaten[0]))
#create some places to collect important things
object_files = []
extra_link_args=[]
main_libraries = []
main_libraries += ['cudart',]+libaten_names
extra_compile_args = ["--std=c++11",]
#findcuda returns root dir of CUDA
#include cuda/include and cuda/lib64 for python module build.
NVCC, CUDA_VERSION, CUDA_HOME=findcuda()
library_dirs.append(os.path.join(CUDA_HOME, "lib64"))
include_dirs.append(os.path.join(CUDA_HOME, 'include'))
class RMBuild(clean):
def run(self):
#BE VERY CAUTIOUS WHEN USING RMTREE!!!
#These are some carefully written/crafted directories
if os.path.exists(buildir):
shutil.rmtree(buildir)
distdir = curdir+os.sep+"dist"
if os.path.exists(distdir):
shutil.rmtree(distdir)
eggdir = curdir+os.sep+"apex.egg-info"
if os.path.exists(eggdir):
shutil.rmtree(eggdir)
clean.run(self)
def CompileCudaFiles(NVCC, CUDA_VERSION):
print()
print("Compiling cuda modules with nvcc:")
gencodes = ['-gencode', 'arch=compute_52,code=sm_52',
'-gencode', 'arch=compute_60,code=sm_60',
'-gencode', 'arch=compute_61,code=sm_61',]
if CUDA_VERSION > 8:
gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',]
#Need arches to compile for. Compiles for 70 which requires CUDA9
nvcc_cmd = [NVCC,
'-Xcompiler',
'-fPIC'
] + gencodes + [
'--std=c++11',
'-O3',
]
for dir in include_dirs:
nvcc_cmd.append("-I"+dir)
# Hack: compiling the cffi kernel code needs the TH{C}
# subdirs of include on path as well
for suffix in ['TH', 'THC']:
nvcc_cmd.append('-I{}/{}'.format(torch_inc, suffix))
for file in cuda_files:
object_name = os.path.basename(
os.path.splitext(file)[0]+".o"
)
object_file = os.path.join(buildir, object_name)
object_files.append(object_file)
file_opts = ['-c', file, '-o', object_file]
extra_args = []
if object_name in cffi_objects:
for module in ['TH', 'THC']:
extra_args.append('-I{}/{}'.format(torch_inc, module))
build_args = nvcc_cmd + extra_args + file_opts
print(' '.join(build_args))
subprocess.check_call(build_args)
for object_file in object_files:
extra_link_args.append(object_file)
if 'clean' not in sys.argv:
print()
print("Arguments used to build CUDA extension:")
print("extra_compile_args :", extra_compile_args)
print("include_dirs: ", include_dirs)
print("extra_link_args: ", extra_link_args)
print("library_dirs: ", library_dirs)
print("libraries: ", main_libraries)
print()
CompileCudaFiles(NVCC, CUDA_VERSION)
print("Building CUDA extension.")
cuda_ext = Extension('apex._C',
[os.path.join('csrc', 'Module.cpp')],
extra_compile_args = extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args,
library_dirs=library_dirs,
runtime_library_dirs = library_dirs,
libraries=main_libraries
)
if 'clean' not in sys.argv:
print("Building module.")
setup(
name='apex', version='0.1',
cmdclass={
'clean' : RMBuild,
},
ext_modules=[cuda_ext,],
name='apex',
version='0.1',
packages=find_packages(exclude=('build',
'csrc',
'include',
'tests',
'dist',
'docs',
'tests',
'examples',
'apex.egg-info',)),
ext_modules=ext_modules,
description='PyTorch Extensions written by NVIDIA',
packages=find_packages(exclude=("build", "csrc", "include", "tests")),
# Require cffi
install_requires=["cffi>=1.0.0"],
setup_requires=["cffi>=1.0.0"],
cffi_modules=[os.path.join(os.path.dirname(__file__),
'build_cffi.py:extension')],
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)