Commit 2fa4dbaf authored by Christian Sarofeen

Initial release
#define PY_SSIZE_T_CLEAN
#define ARG_OFFSET 5
#include <Python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <string>
#include <cmath>
#include <cassert>
#include <iostream>
// #define USE_NVTX
#ifdef USE_NVTX
#include "nvToolsExt.h"
#endif
//Meta-data format we will use
#include <THCTensorInfo.cuh>
//Cuda kernels
#include <kernel.h>
#define ERROR_MSG cout << "Error at " << __FILE__ << ":" << __LINE__ << "\n";
using namespace std;
TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
{
PyObject* PyStrides = PyObject_CallMethod(tensor, "stride", NULL);
if(PyStrides == NULL)
{
ERROR_MSG;
cout << "PyStrides = NULL" << endl;
}
PyObject* PySizes = PyObject_CallMethod(tensor, "size", NULL);
if(PySizes == NULL)
{
ERROR_MSG;
cout << "PySizes = NULL" << endl;
}
PyObject* PyDataPtr = PyObject_CallMethod(tensor, "data_ptr", NULL);
if(PyDataPtr == NULL)
{
ERROR_MSG;
cout << "PyDataPtr = NULL" << endl;
}
void* data_ptr = (void*) PyLong_AsLong(PyDataPtr);
Py_ssize_t ndims = PyTuple_GET_SIZE(PySizes); // tensor.size() returns a torch.Size, which is a tuple
//TODO put proper checking on ndims < MAX_CUTORCH_DIMS
idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS];
for(int i = 0; i < ndims; i++)
{
strides[i] = PyLong_AsLong(PyTuple_GetItem(PyStrides, i));
sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i));
}
// Reference counts still behave strangely, but at least these appear to cap
// the process' memory usage.
Py_DECREF(PyStrides);
Py_DECREF(PySizes);
Py_DECREF(PyDataPtr);
return TensorInfo<void, idxType>(data_ptr, ndims, sizes, strides, data_type);
}
vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
{
vector<TensorInfo<void, idxType> > info_vec;
#ifdef DEBUG_ANY
cout << "Processing " << PyTuple_GET_SIZE(args) << " arguments" << endl;
#endif
#ifdef CHECK_MEMLEAK
for(int iter = 0; iter < 1e7; iter++ )
#endif
for(Py_ssize_t i = 0; i<PyTuple_GET_SIZE(args) - 1; i++)
{
PyObject* pyTensor = PyTuple_GetItem(args, i);
// check type, only take if Tensor, Variable, or Parameter
string objType(pyTensor->ob_type->tp_name);
PyObject* pyObjTypeCall = PyObject_CallMethod(pyTensor, "type", NULL);
if(pyObjTypeCall == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
}
// This gives a segfault:
// cout << "pyObjTypeCall direct conversion attempt = " <<
// PyBytes_AsString(pyObjTypeCall) << endl;
PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
if(pyObjASCII == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
}
// cout << "Py_REFCNT(pyObjTypeCall) = " << Py_REFCNT(pyObjTypeCall) << endl;
Py_DECREF(pyObjTypeCall);
string objTypeCall(PyBytes_AsString(pyObjASCII));
// cout << "Py_REFCNT(pyObjASCII) = " << Py_REFCNT(pyObjASCII) << endl;
Py_DECREF(pyObjASCII);
#ifdef DEBUG_ANY
cout << "arg " << i << endl;
cout << "objType = " << objType << endl;
cout << "objTypeCall = " << objTypeCall << endl;
#endif
if(objTypeCall == "torch.cuda.FloatTensor")
#ifdef CHECK_MEMLEAK
if(iter == 0 )
#endif
info_vec.push_back(PyOb_2_tinfo(pyTensor, FLOAT));
#ifdef CHECK_MEMLEAK
else
info_vec[i] = PyOb_2_tinfo(pyTensor, FLOAT);
#endif
else if(objTypeCall == "torch.cuda.HalfTensor")
info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
// Could add double
else
{
ERROR_MSG;
cout << "For args item " << i << ", unsupported .type() found: "
<< objTypeCall << "\n"
"Supported types:\n"
"torch.cuda.FloatTensor\n"
"torch.cuda.HalfTensor\n"
"torch.autograd.variable.Variable containing FloatTensor\n"
"torch.autograd.variable.Variable containing HalfTensor\n"
"torch.nn.parameter.Parameter containing FloatTensor\n"
"torch.nn.parameter.Parameter containing HalfTensor\n"
<< endl;
}
}
// PyErr_SetString(PyExc_RuntimeError, "Exception set in ");
return info_vec;
}
int getLastArg_AsInt(PyObject* args)
{
// None of these should return new references so I don't think this leaks memory.
int dims = PyLong_AsLong(PyTuple_GetItem(args, PyTuple_GET_SIZE(args) - 1));
return dims;
}
// Stepping stone, can evolve to be more general (argument forwarding?)
template<typename wrapper>
void dispatch
(
float_types rtti,
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
switch(rtti)
{
case FLOAT:
wrapper::template call<float, float, idxType>(tensors, dim);
break;
case HALF:
wrapper::template call<half, float, idxType>(tensors, dim);
break;
default:
std::cout << "Unsupported rtti in Module.cpp:dispatch()" << std::endl;
PyErr_SetString(PyExc_RuntimeError, "Unsupported data type in Module.cpp:dispatch, "
"supported data types are half and float");
exit(-1);
}
}
//Will extract all tensors in order. Assumes a flat structure: tensors cannot be wrapped in
//lists, tuples, or any other iterable structure.
static PyObject* weight_norm_fwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_fwd C backend");
#endif
vector<TensorInfo<void, idxType> > tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[2].dims - 1)
{
// Return NULL after setting the exception so Python raises it.
PyErr_SetString(PyExc_RuntimeError, "weight_norm_fwd currently only "
"supports first or last dimension.");
return NULL;
}
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_fwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
static PyObject* weight_norm_bwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_bwd C backend");
#endif
vector<TensorInfo<void, idxType> >tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[3].dims - 1)
{
// Return NULL after setting the exception so Python raises it.
PyErr_SetString(PyExc_RuntimeError, "weight_norm_bwd currently only "
"supports first or last dimension.");
return NULL;
}
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_bwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
//*******************PYTHON BOILER PLATE*******************
static PyMethodDef apex_methods[] = {
{"weight_norm_fwd", (PyCFunction) weight_norm_fwd, METH_VARARGS, "Slowest-dim norm, forward pass."},
{"weight_norm_bwd", (PyCFunction) weight_norm_bwd, METH_VARARGS, "Slowest-dim norm, backward pass."},
{NULL, NULL, 0, NULL}
};
#if PY_MAJOR_VERSION >= 3
//Module Definitions
static struct PyModuleDef apex = {
PyModuleDef_HEAD_INIT, "apex._C", "Module to add CUDA extensions to Pytorch.", -1, apex_methods
};
//Initialization Function
PyMODINIT_FUNC PyInit__C(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
Py_Initialize();
return PyModule_Create(&apex);
}
#else
PyMODINIT_FUNC init_C(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
(void) Py_InitModule3("apex._C", apex_methods, "A PyTorch Extension.");
}
#endif
//*********************************************************
#include "../include/kernel.h"
template<typename T> struct TtoInt { static const int test = -1; };
template<> struct TtoInt<float> { static const int test = 0; };
template<> struct TtoInt<half> { static const int test = 0; };
template<> struct TtoInt<double> { static const int test = 0; };
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say, 1024 can improve performance by
// increasing the amount of cache available per block, which can improve the cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// Implement some heuristics later?
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
using namespace std;
// lanes is intended to be <= 32.
template <typename T>
__device__ __forceinline__ void reduce_block_into_lanes(T *x, T val, int lanes)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] += x[tid+i]; // JoinOp
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = x[tid] + x[tid+32]; // JoinOp
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final += __SHFL_DOWN(final, i);
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_first_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType rowSize
)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
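// Math per row r (with n_r = ||v_r||_2, computed below): norms[r] = n_r and w_r = g_r * v_r / n_r.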
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
result = sqrtf(result);
if(tid == 0)
DEVICE_LINEAR_GET_F(norms, row) = result;
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, row));
float rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
DEVICE_LINEAR_GET(w, i + rowStart) = ScalarConvert<float,T>::to(g_this_row*val_f*rnorm);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_last_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float alloc[];
float* s = &alloc[0];
float* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
// Better to pass an EpilogueOp to reduce_block_into_lanes, can try later
if(threadIdx.y == 0)
{
float result = s[threadIdx.x];
float norm_this_col = sqrtf(result);
DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
// printf("blockIdx.x = %d, threadIdx.x = %d, norm_this_col = %f\n",
// blockIdx.x, threadIdx.x, norm_this_col);
}
__syncthreads();
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, fast_dim_location));
float rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
DEVICE_LINEAR_GET(w, currentIdx) = ScalarConvert<float,T>::to(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_first_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType rowSize
)
{
// For now, assign one block to each row.
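// Math per row r, with n_r = savedNorms[r] = ||v_r||_2 and s_r = (pLpw_r . v_r), computed below:
//   pLpg_r = s_r / n_r
//   pLpv_r = g_r * (pLpw_r / n_r - v_r * s_r / n_r^3)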
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, i + rowStart));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, i + rowStart));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, row);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
DEVICE_LINEAR_GET(pLpg, row) = ScalarConvert<float, T>::to(result*rnorm);
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, row));
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(IndexType j = tid; j < rowSize; j += stride )
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, j + rowStart));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, j + rowStart));
float pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, j + rowStart) = ScalarConvert<float,T>::to(pLpvj);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_last_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float s[];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
float result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, fast_dim_location);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
DEVICE_LINEAR_GET(pLpg, fast_dim_location) = ScalarConvert<float, T>::to(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, fast_dim_location));
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
float pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, currentIdx) = ScalarConvert<float,T>::to(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_fwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
#endif
auto w (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto norms(*((TensorInfo<AccumType, idxType>*)&tensors[1]));
auto v (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto g (*((TensorInfo<DataType , idxType>*)&tensors[3]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = v.dims - 1; i > 0; i--)
rowSize *= v.sizes[i];
weight_norm_fwd_first_dim_kernel<<<v.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
w,
norms,
v,
g,
rowSize
);
}
else if(dim == v.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < v.dims - 1; i++)
slower_dims_size *= v.sizes[i];
int fast_dim_size = v.sizes[v.dims-1];
weight_norm_fwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
w,
norms,
v,
g,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
// template <typename T, typename IndexType>
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_bwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
// this feels sinful
auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1]));
auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto savedv (*((TensorInfo<DataType , idxType>*)&tensors[3]));
auto savedg (*((TensorInfo<DataType , idxType>*)&tensors[4]));
auto savedNorms(*((TensorInfo<AccumType, idxType>*)&tensors[5]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = savedv.dims - 1; i > 0; i--)
rowSize *= savedv.sizes[i];
weight_norm_bwd_first_dim_kernel<<<pLpw.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
rowSize
);
}
else if(dim == savedv.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < savedv.dims - 1; i++)
slower_dims_size *= savedv.sizes[i];
int fast_dim_size = savedv.sizes[savedv.dims-1];
weight_norm_bwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
#define INSTANTIATE_SEND_TO_FWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_fwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_FWD(float, float, idxType)
INSTANTIATE_SEND_TO_FWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_FWD
#define INSTANTIATE_SEND_TO_BWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_bwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_BWD(float, float, idxType)
INSTANTIATE_SEND_TO_BWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_BWD
#undef BLOCK
#undef TILE_W
#undef TILE_H
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = PyTorch
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docset: html
doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/nv-pytorch2.png --enable-js --online-redirect-url http://pytorch.org/docs/ --force $(BUILDDIR)/html/
# Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution.
cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png
convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png
gh-pages:
git checkout gh-pages
rm -rf build
rm -rf source
git checkout master -- .
make html
rm -rf ../_modules ../_sources ../_static
mv -fv build/html/* ../
rm -rf build
git add -A
git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
.PHONY: help Makefile docset
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.. role:: hidden
:class: hidden-section
apex.RNN
===================================
This submodule is an in-development API aimed at supplying parity with torch.nn.RNN
while being easier to extend. This module is not ready for use and still lacks important
features and validation.
.. automodule:: apex.RNN
.. currentmodule:: apex.RNN
.. RNN
----------
.. autofunction:: LSTM
.. autofunction:: mLSTM
.. autofunction:: GRU
.. autofunction:: ReLU
.. autofunction:: Tanh
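A minimal usage sketch follows. It assumes the constructors take the same positional
arguments as their torch.nn counterparts
(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional),
which is how they are exercised in tests/RNN/RNN_tests.py::

    import torch
    from torch.autograd import Variable
    import apex

    # (input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional)
    rnn = apex.RNN.LSTM(10, 20, 2, True, False, 0.0, False).cuda()

    # Input is (seq_len, batch, input_size), matching torch.nn.RNN's default layout.
    inp = Variable(torch.cuda.FloatTensor(5, 4, 10).uniform_(), requires_grad=True)
    out, hiddens = rnn(inp)
    out.sum().backward()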
body {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Default header fonts are ugly */
h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Use white for docs background */
.wy-side-nav-search {
background-color: #fff;
}
.wy-nav-content-wrap, .wy-menu li.current > a {
background-color: #fff;
}
@media screen and (min-width: 1400px) {
.wy-nav-content-wrap {
background-color: rgba(0, 0, 0, 0.0470588);
}
.wy-nav-content {
background-color: #fff;
}
}
/* Fixes for mobile */
.wy-nav-top {
background-color: #fff;
background-image: url('../img/apex.jpg');
background-repeat: no-repeat;
background-position: center;
padding: 0;
margin: 0.4045em 0.809em;
color: #333;
}
.wy-nav-top > a {
display: none;
}
@media screen and (max-width: 768px) {
.wy-side-nav-search>a img.logo {
height: 60px;
}
}
/* This is needed to ensure that logo above search scales properly */
.wy-side-nav-search a {
display: block;
}
/* This ensures that multiple constructors will remain in separate lines. */
.rst-content dl:not(.docutils) dt {
display: table;
}
/* Use our red for literals (it's very similar to the original color) */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
color: #F05732;
}
.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
.rst-content code.xref, a .rst-content tt, a .rst-content code {
color: #404040;
}
/* Change link colors (except for the menu) */
a {
color: #F05732;
}
a:hover {
color: #F05732;
}
a:visited {
color: #D44D2C;
}
.wy-menu a {
color: #b3b3b3;
}
.wy-menu a:hover {
color: #b3b3b3;
}
/* Default footer text is quite big */
footer {
font-size: 80%;
}
footer .rst-footer-buttons {
font-size: 125%; /* revert footer settings - 1/80% = 125% */
}
footer p {
font-size: 100%;
}
/* For hidden headers that appear in TOC tree */
/* see http://stackoverflow.com/a/32363545/3343043 */
.rst-content .hidden-section {
display: none;
}
nav .hidden-section {
display: inherit;
}
.wy-side-nav-search>div.version {
color: #000;
}
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyTorch documentation build configuration file, created by
# sphinx-quickstart on Fri Dec 23 13:31:47 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
# sys.path.insert(0, os.path.abspath('../../apex/parallel/'))
import apex
# import multiproc
import sphinx_rtd_theme
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
]
napoleon_use_ivar = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'APEx'
copyright = '2018'
author = 'Christian Sarofeen, Natalia Gimelshein, Michael Carilli, Raul Puri'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# TODO: change to [:2] at v1.0
# version = 'master (' + torch.__version__ + ' )'
version = '0.0'
# The full version, including alpha/beta/rc tags.
# TODO: verify this works as expected
release = '0.0.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = []
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
'collapse_navigation': False,
'display_version': True,
'logo_only': True,
}
html_logo = '_static/img/nv-pytorch2.png'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# html_style_path = 'css/pytorch_theme.css'
html_context = {
'css_files': [
'https://fonts.googleapis.com/css?family=Lato',
'_static/css/pytorch_theme.css'
],
}
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'PyTorchdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'apex.tex', 'APEx Documentation',
'Torch Contributors', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'APEx', 'APEx Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'APEx', 'APEx Documentation',
author, 'APEx', 'One line description of project.',
'Miscellaneous'),
]
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
'python': ('https://docs.python.org/', None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
}
# -- A patch that prevents Sphinx from cross-referencing ivar tags -------
# See http://stackoverflow.com/a/41184353/3343043
from docutils import nodes
from sphinx.util.docfields import TypedField
from sphinx import addnodes
def patched_make_field(self, types, domain, items, **kw):
# `kw` catches `env=None` needed for newer sphinx while maintaining
# backwards compatibility when passed along further down!
# type: (List, unicode, Tuple) -> nodes.field
def handle_item(fieldarg, content):
par = nodes.paragraph()
par += addnodes.literal_strong('', fieldarg) # Patch: this line added
# par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
# addnodes.literal_strong))
if fieldarg in types:
par += nodes.Text(' (')
# NOTE: using .pop() here to prevent a single type node to be
# inserted twice into the doctree, which leads to
# inconsistencies later when references are resolved
fieldtype = types.pop(fieldarg)
if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text):
typename = u''.join(n.astext() for n in fieldtype)
typename = typename.replace('int', 'python:int')
typename = typename.replace('long', 'python:long')
typename = typename.replace('float', 'python:float')
typename = typename.replace('type', 'python:type')
par.extend(self.make_xrefs(self.typerolename, domain, typename,
addnodes.literal_emphasis, **kw))
else:
par += fieldtype
par += nodes.Text(')')
par += nodes.Text(' -- ')
par += content
return par
fieldname = nodes.field_name('', self.label)
if len(items) == 1 and self.can_collapse:
fieldarg, content = items[0]
bodynode = handle_item(fieldarg, content)
else:
bodynode = self.list_type()
for fieldarg, content in items:
bodynode += nodes.list_item('', handle_item(fieldarg, content))
fieldbody = nodes.field_body('', bodynode)
return nodes.field('', fieldname, fieldbody)
TypedField.make_field = patched_make_field
.. role:: hidden
:class: hidden-section
apex.fp16_utils
===================================
This submodule contains utilities designed to streamline the mixed precision training recipe
presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions
`Training Neural Networks with Mixed Precision: Theory and Practice`_ and
`Training Neural Networks with Mixed Precision: Real Examples`_.
For Pytorch users, Real Examples in particular is recommended.
.. _`on Parallel Forall`:
https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
.. _`Training Neural Networks with Mixed Precision: Theory and Practice`:
http://on-demand.gputechconf.com/gtc/2018/video/S8923/
.. _`Training Neural Networks with Mixed Precision: Real Examples`:
http://on-demand.gputechconf.com/gtc/2018/video/S81012/
.. automodule:: apex.fp16_utils
.. currentmodule:: apex.fp16_utils
.. FusedNorm
----------
.. autofunction:: prep_param_lists
.. autofunction:: master_params_to_model_params
.. autofunction:: model_grads_to_master_grads
.. autoclass:: FP16_Optimizer
:members:
.. autoclass:: Fused_Weight_Norm
:members:
.. .. automodule:: apex.fp16_utils.loss_scaler
.. autoclass:: LossScaler
:members:
.. autoclass:: DynamicLossScaler
:members:
.. .. automodule:: apex.fp16_utils.fp16util
:members:
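A minimal sketch of the master-weights recipe these utilities are meant to support.
The exact signatures used below (prep_param_lists returning a (model_params, master_params)
pair, and the two copy helpers taking that pair) are assumptions here; the autodoc entries
above are authoritative. ``model``, ``inputs``, ``targets``, and ``loss_fn`` are placeholders::

    import torch
    from apex.fp16_utils import (prep_param_lists,
                                 model_grads_to_master_grads,
                                 master_params_to_model_params)

    model = model.cuda().half()                              # fp16 model
    model_params, master_params = prep_param_lists(model)    # fp32 master copies (assumed return)
    optimizer = torch.optim.SGD(master_params, lr=0.01)
    loss_scale = 128.0

    loss = loss_fn(model(inputs), targets)
    (loss * loss_scale).backward()                 # scale the loss to protect small fp16 gradients
    model_grads_to_master_grads(model_params, master_params)
    for p in master_params:                        # unscale in fp32 before the update
        p.grad.data.mul_(1.0 / loss_scale)
    optimizer.step()
    master_params_to_model_params(model_params, master_params)  # copy updated weights back to fp16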
.. PyTorch documentation master file, created by
sphinx-quickstart on Fri Dec 23 13:31:47 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
:github_url: https://gitlab-master.nvidia.com/csarofeen/apex
APEx (A PyTorch Extension)
===================================
This repo is designed to hold PyTorch modules and utilities that are under active development and experimental. It is not intended as a long-term or production solution; things placed here are intended to be eventually moved upstream into PyTorch.
A major focus of this extension is the training of neural networks using 16-bit floating point math, which offers significant performance benefits on the latest NVIDIA GPU architectures. The reduced dynamic range of half precision, however, makes training more vulnerable to numerical overflow/underflow.
APEx is an NVIDIA-maintained repository of utilities, including some that are targeted at improving the accuracy and stability of half precision networks while maintaining high performance. The utilities are designed to be minimally invasive and easy to use.
Installation requires CUDA 9, PyTorch 0.3 or later, and Python 3. Installation can be done by running
::
git clone https://www.github.com/nvidia/apex
cd apex
python setup.py install
.. toctree::
:maxdepth: 1
:caption: apex
parallel
reparameterization
RNN
fp16_utils
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
.. role:: hidden
:class: hidden-section
apex.parallel
===================================
.. automodule:: apex.parallel
.. currentmodule:: apex.parallel
Still need to figure out how to document multiproc.py.
.. DistributedDataParallel
----------
.. autoclass:: DistributedDataParallel
:members:
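A minimal usage sketch, following the pattern in the distributed MNIST example's main.py
from this repository (the ``args`` flags and the ``Net`` model are taken from that example)::

    import torch
    import torch.distributed as dist
    from apex.parallel import DistributedDataParallel as DDP

    # One process per GPU; set the device before anything else touches CUDA.
    torch.cuda.set_device(args.rank % torch.cuda.device_count())
    dist.init_process_group(args.dist_backend, init_method=args.dist_url,
                            world_size=args.world_size)

    model = Net().cuda()
    model = DDP(model)   # wrap AFTER the model has been moved to the GPU

One process per GPU can be launched with the bundled helper::

    python -m apex.parallel.multiproc main.py <your normal arguments>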
.. role:: hidden
:class: hidden-section
apex.reparameterization
===================================
.. automodule:: apex.reparameterization
.. currentmodule:: apex.reparameterization
.. autoclass:: Reparameterization
:members:
.. autoclass:: WeightNorm
:members:
# Basic Multiprocess Example based on the MNIST example
This version of this example requires APEx, which can be installed from https://www.github.com/nvidia/apex. The example demonstrates how to modify a network to use a basic but effective distributed data parallel module. This parallel method is designed to make it easy to run multi-GPU jobs on a single node. It was created because the parallel methods currently integrated into PyTorch can incur significant overhead due to the Python GIL; this method reduces the influence of those overheads and can provide a performance benefit, especially for networks with a significant number of fast-running operations.
## Getting started
Prior to running please run
```pip install -r requirements.txt```
and start a single-process run to allow the dataset to be downloaded (the download does not work properly multi-GPU; you can stop this job as soon as it starts iterating):
```python main.py```
You can now run the code multi-GPU with
```python -m apex.parallel.multiproc main.py ...```
adding any normal option you'd like.
## Converting your own model
To understand how to convert your own model to use the distributed module included, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
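A condensed sketch of those sections is shown below; the variable names (`Net`, `train_dataset`, `kwargs`, and the argparse flags) come from main.py.

```python
#=====START: ADDED FOR DISTRIBUTED======
import torch.distributed as dist
import torch.utils.data.distributed
from apex.parallel import DistributedDataParallel as DDP

# One process per GPU: pick the device, then join the process group.
torch.cuda.set_device(args.rank % torch.cuda.device_count())
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
                        world_size=args.world_size)

# Shard the dataset across processes and let the sampler handle shuffling.
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset, sampler=train_sampler,
    batch_size=args.batch_size, shuffle=(train_sampler is None), **kwargs)

# Wrap the model AFTER it has been moved to the GPU.
model = Net().cuda()
model = DDP(model)
#=====END: ADDED FOR DISTRIBUTED======
```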
## Requirements
- PyTorch master branch built from source. This is required in order to use NCCL as the distributed backend.
- APEx installed from https://www.github.com/nvidia/apex
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
#=====START: ADDED FOR DISTRIBUTED======
'''Add custom module for distributed'''
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
'''Import distributed data loader'''
import torch.utils.data
import torch.utils.data.distributed
'''Import torch.distributed'''
import torch.distributed as dist
#=====END: ADDED FOR DISTRIBUTED======
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
#======START: ADDED FOR DISTRIBUTED======
'''
Add some distributed options. For explanation of dist-url and dist-backend please see
http://pytorch.org/tutorials/intermediate/dist_tuto.html
--world-size and --rank are required parameters as they will be used by the multiproc.py launcher
but do not have to be set explicitly.
'''
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
#=====END: ADDED FOR DISTRIBUTED======
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
#======START: ADDED FOR DISTRIBUTED======
'''Add a convenience flag to see if we are running distributed'''
args.distributed = args.world_size > 1
'''Check that we are running with cuda, as distributed is only supported for cuda.'''
if args.distributed:
assert args.cuda, "Distributed mode requires running with CUDA."
if args.distributed:
'''
Set cuda device so everything is done on the right GPU.
THIS MUST BE DONE AS SOON AS POSSIBLE.
'''
torch.cuda.set_device(args.rank % torch.cuda.device_count())
'''Initialize distributed communication'''
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
world_size=args.world_size)
#=====END: ADDED FOR DISTRIBUTED======
torch.manual_seed(args.seed)
if args.cuda:
torch.cuda.manual_seed(args.seed)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
#=====START: ADDED FOR DISTRIBUTED======
'''
Change sampler to distributed if running distributed.
Shuffle data loader only if distributed.
'''
train_dataset = datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset, sampler=train_sampler,
batch_size=args.batch_size, shuffle=(train_sampler is None), **kwargs
)
#=====END: ADDED FOR DISTRIBUTED======
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
model = Net()
if args.cuda:
model.cuda()
#=====START: ADDED FOR DISTRIBUTED======
'''
Wrap model in our version of DistributedDataParallel.
This must be done AFTER the model is converted to cuda.
'''
if args.distributed:
model = DDP(model)
#=====END: ADDED FOR DISTRIBUTED======
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
def train(epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.data[0]))
def test():
model.eval()
test_loss = 0
correct = 0
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
for epoch in range(1, args.epochs + 1):
train(epoch)
test()
python -m apex.parallel.multiproc main.py
#ifndef THC_TENSOR_INFO_INC
#define THC_TENSOR_INFO_INC
#include <cuda.h>
#include <cuda_fp16.h>
#include <assert.h>
// Maximum number of dimensions allowed for cutorch
#define MAX_CUTORCH_DIMS 10
// Warning string for tensor arguments that are too large or have too
// many dimensions
#define CUTORCH_STR(X) #X
#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
enum float_types { FLOAT = 0 , HALF = 1, DOUBLE = 2 };
// CUDA kernel argument that defines tensor layout
template <typename T, typename IndexType>
struct TensorInfo {
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]);
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types type);
//Good way to cast from another format
//template <TensorInfo<typename T2, typename I2> >
//TensorInfo(TensorInfo<T2, I2> &tinfo_in){
// data = reinterpret_cast<T*>(tinfo_in.data);
//}
T* data;
IndexType sizes[MAX_CUTORCH_DIMS];
IndexType strides[MAX_CUTORCH_DIMS];
int dims;
float_types type;
};
//Expand our combinations as convenient typedefs
typedef TensorInfo<half, int> t_hi;
typedef TensorInfo<half, long> t_hl;
typedef TensorInfo<float, int> t_fi;
typedef TensorInfo<float, long> t_fl;
template <typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]) {
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
}
template<typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types _type){
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
type=_type;
}
// Translate a linear index for the apply to a T* offset;
// specialized on `Dims` to reduce nvcc compilation time
template <typename T, typename IndexType, int Dims>
struct IndexToOffset {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use static dims
for (int i = Dims - 1; i > 0; --i) {
offset += linearId % info.sizes[i] * info.strides[i];
linearId /= info.sizes[i];
}
offset += linearId * info.strides[0];
return offset;
}
};
// For contiguous tensors, the offset = index
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -2> {
static __forceinline__ __host__ __device__ IndexType
get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
return linearId;
}
};
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use dynamic dims
for (int i = info.dims - 1; i >= 0; --i) {
IndexType curDimIndex = linearId % info.sizes[i];
IndexType curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset;
linearId /= info.sizes[i];
}
return offset;
}
};
#endif // THC_TENSOR_INFO_INC
#include "THCTensorInfo.cuh"
#include <iostream>
#include <cstdio>
#include <cassert>
#include <cuda.h>
// this is suboptimal, try forward declarations later
#include <vector>
#define Dims -2
#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
// template <typename T, typename IndexType>
// void send_to_kernel(
// TensorInfo<T, IndexType> Input_1,
// TensorInfo<T, IndexType> Input_2,
// IndexType totalElems
// );
typedef int idxType;
struct send_to_fwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
struct send_to_bwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
template <typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
#ifdef CUDA_HALF_TENSOR
template <typename Out>
struct ScalarConvert<half, Out> {
static __host__ __device__ __forceinline__ Out to(const half v) {
#ifdef __CUDA_ARCH__
return (Out) __half2float(v);
#else
return (Out) THC_half2float(v);
#endif
}
};
template <typename In>
struct ScalarConvert<In, half> {
static __host__ __device__ __forceinline__ half to(const In v) {
#ifdef __CUDA_ARCH__
return __float2half((float) v);
#else
return THC_float2half((float) v);
#endif
}
};
template <>
struct ScalarConvert<half, half> {
static __host__ __device__ __forceinline__ half to(const half v) {
return v;
}
};
#endif
import re
import os
import platform
import shutil
import inspect
import distutils
import distutils.spawn
from distutils.command.clean import clean
from setuptools import setup, Extension, find_packages
from setuptools.command.install import install
import subprocess
import ctypes.util
import torch
#Walk a path, applying regex_func to each file name to decide whether to keep it.
#If collect is True, return a list of all occurrences; otherwise return the first match.
def find(path, regex_func, collect=False):
collection = [] if collect else None
for root, dirs, files in os.walk(path):
for file in files:
if regex_func(file):
if collect:
collection.append(os.path.join(root, file))
else:
return os.path.join(root, file)
return list(set(collection)) if collect else None
def findcuda():
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
# We use the nvcc path on Linux and the cudart path on macOS.
# No find_nvcc helper is defined in this repo, so locate nvcc on the PATH instead.
osname = platform.system()
if osname == 'Linux':
nvcc_path = distutils.spawn.find_executable('nvcc')
cuda_path = os.path.dirname(nvcc_path) if nvcc_path else None
else:
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
cuda_path = os.path.dirname(cudart_path)
else:
cuda_path = None
if cuda_path is not None:
CUDA_HOME = os.path.dirname(cuda_path)
else:
CUDA_HOME = None
WITH_CUDA = CUDA_HOME is not None
return CUDA_HOME
#Get some important paths
curdir = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
buildir = curdir+os.sep+"build"
if not os.path.exists(buildir):
os.makedirs(buildir)
torch_dir = os.path.split(torch.__file__)[0] + os.sep + "lib"
cuda_files = find(curdir, lambda file: file.endswith(".cu"), True)
cuda_headers = find(curdir, lambda file: file.endswith(".cuh"), True)
headers = find(curdir, lambda file: file.endswith(".h"), True)
libaten = find(torch_dir, re.compile("libaten", re.IGNORECASE).search, False)
aten_h = find(torch_dir, re.compile("aten.h", re.IGNORECASE).search, False)
include_dirs = [os.path.dirname(os.path.dirname(aten_h))]
library_dirs = []
for file in cuda_headers+headers:
dir = os.path.dirname(file)
if dir not in include_dirs:
include_dirs.append(dir)
assert libaten, "Could not find PyTorch's libATen."
assert aten_h, "Could not find PyTorch's ATen header."
library_dirs.append(os.path.dirname(libaten))
#create some places to collect important things
object_files = []
extra_link_args=[]
main_libraries = []
main_libraries += ['cudart', 'cuda', 'ATen']
extra_compile_args = ["--std=c++11",]
#findcuda returns root dir of CUDA
#include cuda/include and cuda/lib64 for python module build.
CUDA_HOME=findcuda()
library_dirs.append(os.path.join(CUDA_HOME, "lib64"))
include_dirs.append(os.path.join(CUDA_HOME, 'include'))
class RMBuild(clean):
def run(self):
#BE VERY CAUTIOUS WHEN USING RMTREE!!!
#These are some carefully written/crafted directories
if os.path.exists(buildir):
shutil.rmtree(buildir)
distdir = curdir+os.sep+"dist"
if os.path.exists(distdir):
shutil.rmtree(distdir)
eggdir = curdir+os.sep+"apex.egg-info"
if os.path.exists(eggdir):
shutil.rmtree(eggdir)
clean.run(self)
def CompileCudaFiles():
print()
print("Compiling cuda modules with nvcc:")
#Architectures to compile for. Compiling for compute capability 7.0 requires CUDA 9.
nvcc_cmd = ['nvcc',
'-Xcompiler',
'-fPIC',
'-gencode', 'arch=compute_52,code=sm_52',
'-gencode', 'arch=compute_60,code=sm_60',
'-gencode', 'arch=compute_61,code=sm_61',
'-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',
'--std=c++11',
'-O3',
]
for dir in include_dirs:
nvcc_cmd.append("-I"+dir)
for file in cuda_files:
object_name = os.path.basename(
os.path.splitext(file)[0]+".o"
)
object_file = os.path.join(buildir, object_name)
object_files.append(object_file)
file_opts = ['-c', file, '-o', object_file]
print(' '.join(nvcc_cmd+file_opts))
subprocess.check_call(nvcc_cmd+file_opts)
for object_file in object_files:
extra_link_args.append(object_file)
print()
print("Arguments used to build CUDA extension:")
print("extra_compile_args :", extra_compile_args)
print("include_dirs: ", include_dirs)
print("extra_link_args: ", extra_link_args)
print("library_dirs: ", library_dirs)
print("libraries: ", main_libraries)
print()
CompileCudaFiles()
print("Building CUDA extension.")
cuda_ext = Extension('apex._C',
[os.path.join('csrc', 'Module.cpp')],
extra_compile_args = extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args,
library_dirs=library_dirs,
runtime_library_dirs = library_dirs,
libraries=main_libraries
)
print("Building module.")
setup(
name='apex', version='0.1',
cmdclass={
'clean' : RMBuild,
},
ext_modules=[cuda_ext,],
description='PyTorch Extensions written by NVIDIA',
packages=find_packages(exclude=("build", "csrc", "include", "tests")),
)
import torch
import torch.nn as nn
from torch.autograd import Variable
import apex
from apex.RNN.models import bidirectionalRNN, stackedRNN, RNNCell
from torch.nn._functions.rnn import LSTMCell
import itertools
torch.backends.cudnn.enabled=False
batch_first = False #not implemented yet
dropout = 0.0 #How to validate?
bidirectional = False #True works, but differs in definition to PyTorch
rnn_types = ['LSTM', 'GRU', 'ReLU', 'Tanh']
sizes = [8,4,2]
seq_sizes = sizes
hidden_sizes = sizes
inp_sizes = sizes
batch_sizes = sizes
num_layerss = sizes
biases = [True]
def copy_param_set(pyt_rnn, my_rnn, layer=0, reverse=False):
my_params = None
rnn = None
if isinstance(my_rnn, bidirectionalRNN):
rnn = my_rnn.fwd.rnns[layer] if not reverse else my_rnn.bckwrd.rnns[layer]
elif isinstance(my_rnn, stackedRNN):
rnn = my_rnn.rnns[layer]
else:
raise RuntimeError()
param_names = ['w_ih', 'w_hh', 'b_ih', 'b_hh']
if not hasattr(rnn, 'b_hh'):
param_names = param_names[:2]
my_params = [getattr(rnn, param_name) for param_name in param_names]
pyt_params = None
param_names = ['weight_ih_', 'weight_hh_', 'bias_ih_', 'bias_hh_']
reverse_str = '_reverse' if reverse else ''
if not hasattr(pyt_rnn, 'bias_hh_l0'):
param_names=param_names[:2]
pyt_params =[getattr(pyt_rnn, param_name + 'l' + str(layer) + reverse_str )
for param_name in param_names ]
for pyt_param, my_param in zip(pyt_params, my_params):
pyt_param.data.copy_(my_param.data)
def copy_all_params(pyt_rnn, my_rnn):
for layer in range(num_layers):
copy_param_set(pyt_rnn, my_rnn, layer)
if bidirectional:
copy_param_set(pyt_rnn, my_rnn, layer, bidirectional)
def compare_variables(v1, v2, msg, params):
diff = float((v1.data-v2.data).abs().max())
if diff > 1e-5:
print("Error of ", diff, " found for ", msg, " for case: ", str(params))
def compare_tuple_variables(t1, t2, msg, params):
for var1, var2 in zip(t1, t2):
compare_variables(var1, var2, msg, params)
def maybe_compare(v1, v2, msg, params):
if isinstance(v1, Variable) and isinstance(v2, Variable):
compare_variables(v1, v2, msg, params)
else:
compare_tuple_variables(v1, v2, msg, params)
product = list(itertools.product(rnn_types, seq_sizes, hidden_sizes, inp_sizes, batch_sizes, num_layerss, biases))
for test_case in product:
rnn_type, seq_size, hidden_size, inp_size, batch_size, num_layers, bias = test_case
inp = torch.cuda.FloatTensor(seq_size, batch_size, inp_size).uniform_()
if rnn_type == 'ReLU' or rnn_type == 'Tanh':
pytorch_rnn = nn.RNN(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, nonlinearity=rnn_type.lower()).cuda()
else:
pytorch_rnn = getattr(nn, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda()
my_rnn = getattr(apex.RNN.models, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda()
copy_all_params(pytorch_rnn, my_rnn)
pyt_inp = Variable(inp, requires_grad=True)
my_inp = Variable(inp, requires_grad=True)
my_out, my_hiddens = my_rnn(my_inp)
pyt_out, pyt_hiddens = pytorch_rnn(pyt_inp)
pyt_out.sum().backward()
my_out.sum().backward()
maybe_compare(pyt_out, my_out, "out", test_case)
#If there's only one hidden state PyTorch doesn't return it in a tuple,
#apex does, so we wrap PyTorch's returned hidden state in a tuple.
if not isinstance(pyt_hiddens, tuple):
pyt_hiddens = (pyt_hiddens,)
try:
for i, (pyt_hid, my_hid) in enumerate(zip(pyt_hiddens, my_hiddens)):
maybe_compare(pyt_hid, my_hid , "hx_"+str(i), test_case)
except ValueError:
maybe_compare(pyt_hiddens, my_hiddens , "hx_0", test_case)
maybe_compare(pyt_inp.grad, my_inp.grad, "inp.grad", test_case)
print("Test passed.")
import torch
from torch.autograd import Variable
# import apex
import numpy as np
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 3
cols = 20
dims = rows, cols
# Incoming gradient vectors we will use later
# Need to create the fp16 versions as a half() copy of a Tensor first rather than
# a Variable, because if you create pt_input_control as a Variable then say
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pt_input_control's computational graph, so it will not be a leaf!
pt_input_control = Variable(torch.randn(*dims).cuda(), requires_grad=True)
# pt_input_control = torch.ones(*dims).cuda()
pt_input_fp16 = pt_input_control.half()
pt_output_fp16 = pt_input_fp16.sum()
pt_output_control = pt_input_control.sum()
print("After sum()s, before backwards:")
print("pt_output_control.requires_grad = ", pt_output_control.requires_grad)
print("pt_output_control.volatile = ", pt_output_control.volatile)
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_fp16.backward() # pt_input_fp16 is not the leaf of this graph, pt_input_control is.
print("After pt_output_fp16.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_control.backward() # Both backward() calls have pt_input_control as leaves, and so
# will accumulate gradients into pt_input_control.grad
print("After pt_output_control.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
print("pt_output_control = ", pt_output_control)
print("pt_output_fp16 = ", pt_output_fp16)