Commit 23d9faaf authored by yanyan's avatar yanyan
Browse files

working on cutlass

parent 853302aa
......@@ -43,6 +43,7 @@ add_subdirectory(third_party/pybind11)
set(ALL_LIBS ${TORCH_LIBRARIES})
set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
if (SPCONV_BuildCUDA)
set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
......
global:
console_url: localhost:50091
envs:
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python # c++ libprotobuf and python will conflicit
analyzers: # only one analyzer is allowed for one type for now.
PythonAnalyzer:
SimpleCPPAnalyzer: # $<astex> devops.devs = ["_ci_dev_xxx"] </astex> is allowed in raw sources.
includes: ["*.cpp", "*.cu", "*.cc", "*.h", "*.hpp", "*.hxx", "*.cxx"]
observers:
# run test functions when that function change or marked function change.
test:
type: TestObserver
# run dev functions when that function change or marked function change.
dev:
type: DevObserver
pattern: _ci_dev_.*
clangdev:
type: CPPDevObserver
main_pattern: dev_.*\.(cc|cpp|cxx)
pattern: .*\.(cc|cpp|cxx|h|hpp|hxx)
compiler: clang++
executable: build/codeai_dev
includes: [
include,
/usr/local/cuda/include,
/home/yy/anaconda3/include,
/home/yy/anaconda3/include/python3.7m,
third_party/pybind11/include,
third_party/include,
/home/yy/library/boost_1_72_0,
]
libpaths: [
/home/yy/anaconda3/lib,
]
libraries: [-lnvinfer, -lpython3.7m, -lcublas, -lcudart, -ljpeg]
std: c++2a
options: [-Wall, -Wextra]
cudadev:
type: CPPDevObserver
main_pattern: dev_.*\.cu
pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler: nvcc
executable: build/codeai_dev_cuda
run_cmd: [$(executable)]
sources: []
includes: [
include,
/usr/local/cuda/include,
/home/yy/anaconda3/include,
/home/yy/anaconda3/include/python3.7m,
third_party/pybind11/include,
third_party/cutlass/include,
]
libpaths: [
/usr/local/cuda/lib64,
/home/yy/anaconda3/lib,
]
libraries: [-lpython3.7m, -lcudart, -lcublas, -ljpeg]
std: c++14
options: [
-Wno-deprecated-declarations,
"-gencode=arch=compute_52,code=sm_61",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_70,code=sm_70",
"-gencode=arch=compute_75,code=sm_75",
]
torchdev:
type: CPPDevObserver
main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler: clang++
executable: build/codeai_dev_torch
run_cmd: [$(executable)]
fail_cmds: # run cmd when pervious run fail with retcode
-6: [gdb, -ex, run, -ex, bt, -ex, quit, $(executable)] # segfault in unix
includes: [
include,
/home/yy/anaconda3/lib/python3.7/site-packages/torch/include,
/home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include,
/usr/local/cuda/include,
/home/yy/anaconda3/include,
/home/yy/anaconda3/include/python3.7m,
third_party/pybind11/include,
third_party/cutlass/include,
]
libpaths: [
/home/yy/anaconda3/lib/python3.7/site-packages/torch/lib,
/usr/local/cuda/lib64,
/home/yy/anaconda3/lib,
]
libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread,
"-Wl,--no-as-needed,-lc10",
"-Wl,--no-as-needed,-ltorch",
"-Wl,--no-as-needed,-ltorch_cpu",
"-Wl,--no-as-needed,-lc10_cuda",
"-Wl,--no-as-needed,-ltorch_cuda"]
std: c++2a
options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
cutlass @ e33d90b3
Subproject commit e33d90b36109f67915a80c532ebbb978b72c7bd2
......@@ -9,6 +9,10 @@ template <class... T> struct mp_list {};
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
template <int... I>
using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;
namespace detail {
template <class... Ts, class F>
......
......@@ -418,7 +418,8 @@ bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
return;
}
if (count >= val_lst_size) {
TV_THROW_INVALID_ARG("iterator length invalid:", val_lst_size);
equal = false;
return;
}
constexpr auto c = decltype(E)::value;
if (c != *iter) {
......
......@@ -54,7 +54,7 @@ class CMakeBuild(build_ext):
'-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
'-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
'-DSPCONV_BuildTests=OFF',
'-DPYTORCH_VERSION={}'.format(PYTORCH_VERSION_NUMBER)
'-DPYTORCH_VERSION={}'.format(PYTORCH_VERSION_NUMBER),
] # -arch=sm_61
if not torch.cuda.is_available() and SPCONV_FORCE_BUILD_CUDA is None:
cmake_args += ['-DSPCONV_BuildCUDA=OFF']
......
......@@ -10,7 +10,7 @@ if(OpenMP_CXX_FOUND)
endif()
target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
target_include_directories(spconv PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} )
set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <ATen/ATen.h>
#include <boost/mp11.hpp>
#include <chrono>
#include <cuhash/hash_table.h>
#include <limits>
......@@ -226,6 +227,18 @@ int create_submconv_indice_pair_cuda(
spatialVolume *= outSpatialShape[i];
}
auto dispatcher = tv::DispatchIntNoexcept<tv::mp_list_c<int, 1, 3, 5>>();
namespace mp11 = boost::mp11;
using kernel2_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>>;
using kernel3_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>>;
using kernel3_candidates_final_t = mp11::mp_push_back<kernel3_candidates_t>;
auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
auto dispatcher3 = tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
if (useHash) {
auto table = cuhash::HashTable();
......@@ -263,43 +276,33 @@ int create_submconv_indice_pair_cuda(
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
found = true;
getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
tableSize, tableData, constants, stash_constants,
stash_count);
});
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume, tableSize, tableData,
constants, stash_constants, stash_count);
});
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
dispatcher(kernelSize[2], [&](auto K2C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
constexpr int K2 = decltype(K2C)::value;
dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
found = true;
getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
tableSize, tableData, constants, stash_constants,
stash_count);
});
});
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume, tableSize, tableData,
constants, stash_constants, stash_count);
});
}
}
......@@ -338,24 +341,35 @@ int create_submconv_indice_pair_cuda(
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
found = true;
getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
});
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
found = true;
getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
/*
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
dispatcher(kernelSize[2], [&](auto K2C) {
......@@ -372,7 +386,7 @@ int create_submconv_indice_pair_cuda(
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
});
});
});
});*/
}
}
if (!found) {
......@@ -396,7 +410,8 @@ int create_submconv_indice_pair_cuda(
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicesIn.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn, spatialVolume);
tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn,
spatialVolume);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
......
......@@ -35,7 +35,7 @@ using half_vec_t =
template <typename T>
using half_vec_sadd_t =
std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
......@@ -45,6 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
// auto timer = spconv::CudaContextTimer<>();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = half_vec_t<T>;
......@@ -53,8 +54,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
......@@ -87,6 +87,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
features.data_ptr<T>(),
indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
......@@ -143,8 +144,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4;
......
......@@ -70,7 +70,7 @@ class Net(nn.Module):
def main():
voxels, coors, spatial_shape = waymo_data()
voxels_th = torch.from_numpy(voxels).cuda().float()
coors_th = torch.from_numpy(coors).cuda()
coors_th = torch.from_numpy(coors).cuda().int()
net = Net(spatial_shape[::-1]).cuda().eval().float()
print(coors_th.shape)
out = net(voxels_th, coors_th, 1)
......
......@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape):
class TestSpConv(TestCase):
def testSpConv3d(self):
np.random.seed(484)
devices = ["cpu:0"]
devices = ["cuda:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
......@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if __name__ == '__main__':
main(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
# main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
# TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment