// Number of threads per block, as a compile-time constant.
// NOTE(review): presumably the block size used for kernel launches elsewhere
// in the project — confirm against the call sites.
constexpr int threadsPerBlock = 512;

// Integer ceiling division: returns the smallest value q such that q * m >= n.
//
// Template parameter:
//   T - an integral type (the implementation uses `/` and `%`).
//
// Parameters:
//   n - the dividend (e.g. the number of items to cover).
//   m - the divisor (e.g. the chunk size); must be > 0.
//
// Returns:
//   n / m rounded up to the next integer.
//
// The result is computed from the quotient and remainder rather than the
// textbook `(n + m - 1) / m`, because the intermediate sum `n + m - 1` can
// overflow when n is close to the maximum of T (undefined behaviour for signed
// types, a silently wrong answer for unsigned types).
template <typename T>
constexpr inline T ceil_div(T n, T m) {
  // A strictly positive remainder means the truncated quotient fell short of
  // the exact ratio, so one more chunk is required. For negative n the
  // remainder is <= 0 and truncation toward zero is already the ceiling.
  return n / m + ((n % m > 0) ? T(1) : T(0));
}