"git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "4e9989344d20e3f8af44767f0eadeaab5fff8c00"
Commit db13f963 authored by DouJS's avatar DouJS Committed by Frank Lee
Browse files

[NFC] polish colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh code style (#1264)

parent 5d7366b1
// modified from
// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/AccumulateType.h> #include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h> #include <ATen/cuda/Exceptions.h>
#include <assert.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "compat.h"
#include <assert.h> #include "compat.h"
// #include <iostream>
// Per-depth caps on how many tensors / thread blocks one kernel launch may
// carry. NOTE(review): presumably sized to fit the CUDA kernel-argument
// space limit, since TensorListMetadata is passed by value — confirm against
// the upstream apex source before changing.
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

// Launch metadata handed to multi_tensor_apply_kernel by value: the data
// pointers and element counts of every tensor packed into this launch, plus
// a per-block mapping back to (tensor slot, chunk index).
template <int n>
struct TensorListMetadata {
  // addresses[d][i]: data pointer of the i-th packed tensor in list d.
  void *addresses[n][depth_to_max_tensors[n - 1]];
  // sizes[i]: numel of the i-th packed tensor.
  int sizes[depth_to_max_tensors[n - 1]];
  // block_to_tensor[b]: which packed tensor thread block b works on.
  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
  // block_to_chunk[b]: which chunk of that tensor thread block b works on.
  // I fear this needs to be a full int.
  int block_to_chunk[depth_to_max_blocks[n - 1]];
  // Index of the first tensor covered by this launch.
  int start_tensor_this_launch;
};
// Trampoline kernel: forwards the chunk bookkeeping (chunk size, the shared
// no-op flag, and the packed tensor metadata) straight to the user-supplied
// functor, which does all the actual per-chunk work.
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
                                          volatile int *noop_flag, T tl,
                                          U callable, ArgTypes... args) {
  // Hand the chunk information to the user-supplied functor to process
  // however it likes.
  callable(chunk_size, noop_flag, tl, args...);
}
// Splits every tensor in `tensor_lists` into chunks of `chunk_size` elements,
// packs as many (tensor, chunk) pairs as fit into one TensorListMetadata
// struct, and launches multi_tensor_apply_kernel once per packed batch on the
// current CUDA stream (one thread block of `block_size` threads per pair).
//
// Checked preconditions: tensor_lists has exactly `depth` inner lists, all
// inner lists have the same non-zero length, and every tensor is contiguous,
// on the same CUDA device as the first, and numel-matched with list 0.
//
// `noop_flag` is an int tensor whose pointer is forwarded to the functor
// (NOTE(review): presumably lets the functor signal early exit — confirm
// against the functor implementations).
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
    int block_size, int chunk_size, const at::Tensor &noop_flag,
    const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
    ArgTypes... args) {
  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
  int len0 = tensor_lists[0].size();
  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
  auto ref_device = tensor_lists[0][0].device();
  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");

  // Validate layout/device/size agreement across all lists. Index-based
  // loops rather than range-for because the indices are needed.
  for (int li = 0; li < tensor_lists.size(); li++) {
    TORCH_CHECK(tensor_lists[li].size() == len0,
                "Size mismatch among tensor lists");
    for (int ti = 0; ti < tensor_lists[li].size(); ti++) {
      // TODO: Print which tensor fails.
      bool contiguous_memory = tensor_lists[li][ti].is_contiguous();
#ifdef VERSION_GE_1_5
      // On PyTorch >= 1.5, channels-last layout also counts as contiguous.
      contiguous_memory =
          (contiguous_memory ||
           tensor_lists[li][ti].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
      TORCH_CHECK(tensor_lists[li][ti].device() == ref_device,
                  "A tensor was not on the same device as the first tensor");
      TORCH_CHECK(tensor_lists[li][ti].numel() == tensor_lists[0][ti].numel(),
                  "Size mismatch");
    }
  }

  const int ntensors = tensor_lists[0].size();

  TensorListMetadata<depth> tl;

  // Make sure the launches target the tensors' device and stream.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
  auto stream = at::cuda::getCurrentCUDAStream();

  tl.start_tensor_this_launch = 0;
  int loc_block_info = 0;   // (tensor, chunk) pairs queued in tl so far
  int loc_tensor_info = 0;  // tensors whose metadata is queued in tl so far
  for (int t = 0; t < ntensors; t++) {
    // Queue this tensor's size and per-depth data pointers.
    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
    for (int d = 0; d < depth; d++)
      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
    loc_tensor_info++;

    // Ceil-divide so a trailing partial chunk still gets a block.
    const int chunks_this_tensor =
        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
      // std::cout << chunks_this_tensor << std::endl;
      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
      tl.block_to_chunk[loc_block_info] = chunk;
      loc_block_info++;

      // Flush when either capacity limit is reached or we just queued the
      // final chunk of the final tensor.
      const bool tensors_full =
          (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
           chunk == chunks_this_tensor - 1);
      const bool blocks_full =
          (loc_block_info == depth_to_max_blocks[depth - 1]);
      const bool last_chunk =
          (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
      if (tensors_full || blocks_full || last_chunk) {
        // using accscalar_t = acc_type<scalar_t, true>;
        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

        AT_CUDA_CHECK(cudaGetLastError());

        // Reset. The control flow possibilities here make my brain hurt.
        loc_block_info = 0;
        if (chunk == chunks_this_tensor - 1) {
          // Current tensor fully dispatched: next launch starts fresh at t+1.
          loc_tensor_info = 0;
          tl.start_tensor_this_launch = t + 1;
        } else {
          // Flushed mid-tensor: carry this tensor's metadata into slot 0 so
          // its remaining chunks go out with the next launch.
          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
          for (int d = 0; d < depth; d++)
            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
          loc_tensor_info = 1;
          tl.start_tensor_this_launch = t;
        }
      }
    }
  }
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment