#pragma once
// from pytorch.aten
#include "tensorview.h"

namespace tv {
namespace launch {

// Integer ceiling division: smallest integer >= a / b.
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
  return (a + b - 1) / b;
}

// Use 1024 threads per block, which requires CUDA sm_2x or above.
constexpr int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks needed to cover N threads.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=",
                   N);
  return DivUp(N, CUDA_NUM_THREADS);
}

}  // namespace launch
}  // namespace tv
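
// Usage sketch (illustrative only, not part of this header): `reduceKernel`,
// `devPtr`, and `n` are hypothetical names for a caller's kernel, device
// buffer, and element count.
//
//   int n = 1 << 20;
//   dim3 grid(tv::launch::getBlocks(n));
//   dim3 block(tv::launch::CUDA_NUM_THREADS);
//   reduceKernel<<<grid, block>>>(devPtr, n);
//
// Because the grid size is rounded up to a multiple of CUDA_NUM_THREADS,
// each thread should guard its work with a bounds check such as
// `if (idx < n) { ... }`.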