OpenDAS / FAST-RNNT · Commits · 4c605c1e

Commit 4c605c1e, authored Dec 25, 2020 by anton
removed the slower coalesced flavor
Parent: caff36e7

Showing 3 changed files with 20 additions and 105 deletions (+20 -105)
discounted_cumsum.cpp        +4   -6
discounted_cumsum.py         +9   -16
discounted_cumsum_kernel.cu  +7   -83
discounted_cumsum.cpp

```diff
 #include <torch/extension.h>
 
-torch::Tensor discounted_cumsum_right_minthreads(torch::Tensor x, double gamma);
-torch::Tensor discounted_cumsum_right_coalesced(torch::Tensor x, double gamma);
+torch::Tensor discounted_cumsum_right(torch::Tensor x, double gamma);
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("discounted_cumsum_right_minthreads", &discounted_cumsum_right_minthreads, "Discounted Cumulative Sum Right Minimum Threads");
-    m.def("discounted_cumsum_right_coalesced", &discounted_cumsum_right_coalesced, "Discounted Cumulative Sum Right Coalesced Writes");
+    m.def("discounted_cumsum_right", &discounted_cumsum_right, "Discounted Cumulative Sum Right");
 }
```
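For context, the single operation that remains bound here computes a right-to-left discounted cumulative sum along the last dimension, out[t] = x[t] + gamma * out[t+1]. The sketch below is a minimal CPU illustration of that definition in PyTorch; the name `discounted_cumsum_right_ref` is hypothetical and not part of this repository.

```python
# Minimal CPU sketch of the operation the extension binds (illustration only;
# discounted_cumsum_right_ref is a hypothetical name, not the repository's API).
import torch

def discounted_cumsum_right_ref(x: torch.Tensor, gamma: float) -> torch.Tensor:
    # Right-to-left recursion: out[..., t] = x[..., t] + gamma * out[..., t + 1]
    out = x.clone()
    for t in range(x.size(-1) - 2, -1, -1):
        out[..., t] += gamma * out[..., t + 1]
    return out

x = torch.ones(1, 4)
print(discounted_cumsum_right_ref(x, 0.5))  # tensor([[1.8750, 1.7500, 1.5000, 1.0000]])
```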
discounted_cumsum.py

```diff
@@ -29,12 +29,8 @@ torch_discounted_cumsum = load(
 # return d_input, d_weights, d_bias, d_old_h, d_old_cell
 
 
-def discounted_cumsum_right_minthreads(input, gamma):
-    return torch_discounted_cumsum.discounted_cumsum_right_minthreads(input, gamma)
-
-
-def discounted_cumsum_right_coalesced(input, gamma):
-    return torch_discounted_cumsum.discounted_cumsum_right_coalesced(input, gamma)
+def discounted_cumsum_right(input, gamma):
+    return torch_discounted_cumsum.discounted_cumsum_right(input, gamma)
 
 
 def discounted_cumsum_right_gold(input, gamma):
@@ -50,33 +46,30 @@ def discounted_cumsum_right_gold(input, gamma):
     return out
 
 
-def test_fn(fn):
+def test():
     torch.manual_seed(0)
     x = torch.full((10, 10000), fill_value=1.0, dtype=torch.float32).cuda()
     gamma = 0.99
     out_gold_32 = discounted_cumsum_right_gold(x, gamma)
     out_gold_64 = discounted_cumsum_right_gold(x.double(), gamma)
-    out_fn = fn(x, gamma)
+    out_fn = discounted_cumsum_right(x, gamma)
     diff_32 = (out_fn - out_gold_32).abs().max().item()
     diff_64 = (out_fn - out_gold_64).abs().max().item()
-    print(fn.__name__)
     print('diff_32', diff_32)
     print('diff_64', diff_64)
 
 
-def test_speed(fn, reps=10000):
+def test_speed(reps=10000):
     torch.manual_seed(0)
     x = torch.randn(10, 100000, dtype=torch.float32).cuda()
     gamma = 0.99
     t1 = time.time()
     for _ in range(reps):
-        fn(x, gamma)
+        discounted_cumsum_right(x, gamma)
     t2 = time.time()
-    print(fn.__name__, t2 - t1)
+    print('sec:', t2 - t1)
 
 
 if __name__ == '__main__':
-    test_fn(discounted_cumsum_right_minthreads)
-    test_fn(discounted_cumsum_right_coalesced)
-    test_speed(discounted_cumsum_right_minthreads)
-    test_speed(discounted_cumsum_right_coalesced)
+    test()
+    test_speed()
```
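The wrapper above calls into `torch_discounted_cumsum`, which this module builds with `torch.utils.cpp_extension.load` (visible in the first hunk header). Below is a hedged sketch of what that JIT-compile-and-load step typically looks like; the source paths and keyword values are assumptions for illustration, since the actual `load(...)` arguments fall outside the diff.

```python
# Hedged sketch of JIT-building the extension; the paths and keyword values are
# assumed. Only the variable name torch_discounted_cumsum appears in the diff.
import torch
from torch.utils.cpp_extension import load

torch_discounted_cumsum = load(
    name='torch_discounted_cumsum',            # assumed module name
    sources=['discounted_cumsum.cpp',
             'discounted_cumsum_kernel.cu'],   # assumed relative paths
    verbose=True,
)

if torch.cuda.is_available():
    x = torch.randn(10, 100000, device='cuda')
    y = torch_discounted_cumsum.discounted_cumsum_right(x, 0.99)
    print(y.shape)  # torch.Size([10, 100000])
```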
discounted_cumsum_kernel.cu

```diff
@@ -14,14 +14,11 @@ int log2ceil(int x) {
 
 
 template <typename scalar_t>
-__global__ void discounted_cumsum_right_kernel_minthreads_stage(
+__global__ void discounted_cumsum_right_kernel_stage(
     torch::PackedTensorAccessor32<scalar_t, 2> x,
     const scalar_t gamma,
     int stage
 ) {
-    // Pros: Minimum required number of threads, assigns them dynamically to respective positions upon each iteration.
-    // Cons: Uncoalesced writes.
     const int len = x.size(1);
     const int threadidx = blockIdx.x * blockDim.x + threadIdx.x;
     const int threadidy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -53,49 +50,9 @@ __global__ void discounted_cumsum_right_kernel_minthreads_stage(
 }
 
 
-template <typename scalar_t>
-__global__ void discounted_cumsum_right_kernel_coalesced_stage(
-    torch::PackedTensorAccessor32<scalar_t, 2> x,
-    const scalar_t gamma,
-    int stage
-) {
-    // Pros: Coalesced writes.
-    // Cons: Threads allocated statically per each element. Half of threads idles upon each iteration.
-    const int len = x.size(1);
-    const int threadidx = blockIdx.x * blockDim.x + threadIdx.x;
-    const int threadidy = blockIdx.y * blockDim.y + threadIdx.y;
-    if (threadidx >= len || threadidy >= x.size(0)) {
-        return;
-    }
-    int gr_prev_stride = 1 << stage;
-    int gr_cur_stride = gr_prev_stride << 1;
-    int gr_of_thread = threadidx >> (stage + 1);
-    int thread_in_gr = threadidx - (gr_of_thread << (stage + 1));
-    int change_pos = threadidx;
-    int discounted_pos = gr_of_thread * gr_cur_stride + gr_prev_stride;
-    int discount_power = gr_prev_stride - thread_in_gr;
-    if (thread_in_gr >= gr_prev_stride || discounted_pos >= len) {
-        return;
-    }
-    x[threadidy][change_pos] = discounted_sum_pow(
-        x[threadidy][change_pos],
-        x[threadidy][discounted_pos],
-        gamma,
-        discount_power
-    );
-}
-
-
-torch::Tensor discounted_cumsum_right_minthreads(torch::Tensor x, double gamma) {
-    // Pros: Minimum required number of threads, assigns them dynamically to respective positions upon each iteration.
-    // Cons: Uncoalesced writes.
+torch::Tensor discounted_cumsum_right(torch::Tensor x, double gamma) {
+    // Minimum required number of threads, assigns them dynamically to respective positions upon each iteration.
+    // Results in uncoalesced writes, which is still faster than coalesced writes with half threads idling.
     TORCH_CHECK(x.type().is_cuda(), "Input must be a CUDA tensor");
     TORCH_CHECK(x.is_contiguous(), "Input must be contiguous");
@@ -108,47 +65,14 @@ torch::Tensor discounted_cumsum_right_minthreads(torch::Tensor x, double gamma)
     auto y = x.clone();
-    const int threads = 32;
+    const int threads = 64;
     const int nstages = log2ceil(x.size(1));
     const int threads_total_x = 1 << (nstages - 1);
     const dim3 blocks((threads_total_x + threads - 1) / threads, x.size(0));
     for (int stage = 0; stage < nstages; stage++) {
-        AT_DISPATCH_FLOATING_TYPES(x.type(), "discounted_cumsum_right_kernel_minthreads_stage", ([&] {
-            discounted_cumsum_right_kernel_minthreads_stage<scalar_t><<<blocks, threads>>>(
-                y.packed_accessor32<scalar_t, 2>(),
-                scalar_t(gamma),
-                stage
-            );
-        }));
-    }
-    return y;
-}
-
-
-torch::Tensor discounted_cumsum_right_coalesced(torch::Tensor x, double gamma) {
-    // Pros: Coalesced writes.
-    // Cons: Threads allocated statically per each element. Half of threads idles upon each iteration.
-    TORCH_CHECK(x.type().is_cuda(), "Input must be a CUDA tensor");
-    TORCH_CHECK(x.is_contiguous(), "Input must be contiguous");
-    TORCH_CHECK(x.dim() == 2, "Input must be 2-dimensional");
-    TORCH_CHECK(0.0 <= gamma && gamma <= 1.0, "Gamma must be in the range [0,1]");
-    if (x.size(1) == 0) {
-        return x;
-    }
-    auto y = x.clone();
-    const int threads = 32;
-    const int nstages = log2ceil(x.size(1));
-    const dim3 blocks((x.size(1) + threads - 1) / threads, x.size(0));
-    for (int stage = 0; stage < nstages; stage++) {
-        AT_DISPATCH_FLOATING_TYPES(x.type(), "discounted_cumsum_right_kernel_coalesced_stage", ([&] {
-            discounted_cumsum_right_kernel_coalesced_stage<scalar_t><<<blocks, threads>>>(
+        AT_DISPATCH_FLOATING_TYPES(x.type(), "discounted_cumsum_right_kernel_stage", ([&] {
+            discounted_cumsum_right_kernel_stage<scalar_t><<<blocks, threads>>>(
                 y.packed_accessor32<scalar_t, 2>(),
                 scalar_t(gamma),
                 stage
```
...
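For readers following the kernel's index arithmetic: each launch handles one stage of a stride-doubling scan, and `nstages = log2ceil(len)` launches reproduce the sequential recursion. The sketch below mirrors the per-stage update visible in the removed coalesced kernel (per the comments, the surviving kernel performs the same updates with a leaner thread assignment). It assumes `discounted_sum_pow(a, b, gamma, p)` evaluates to `a + b * gamma**p`, which is not shown in this diff, and every name in the sketch is illustrative rather than the repository's API.

```python
# CPU sketch of the per-stage stride-doubling update (illustration only; assumes
# discounted_sum_pow(a, b, gamma, p) == a + b * gamma**p, not shown in the diff).
import math
import torch

def discounted_cumsum_right_stages(x: torch.Tensor, gamma: float) -> torch.Tensor:
    y = x.clone()
    n = y.size(-1)
    nstages = math.ceil(math.log2(n)) if n > 1 else 0  # log2ceil in the .cu file
    for stage in range(nstages):
        prev_stride = 1 << stage        # gr_prev_stride
        cur_stride = prev_stride << 1   # gr_cur_stride
        for pos in range(n):            # each CUDA thread handles one such slot
            group = pos >> (stage + 1)             # gr_of_thread
            offset = pos - (group << (stage + 1))  # thread_in_gr
            src = group * cur_stride + prev_stride # discounted_pos
            if offset >= prev_stride or src >= n:
                continue                           # slot is idle at this stage
            # change_pos == pos; discount_power == prev_stride - offset
            y[..., pos] += y[..., src] * gamma ** (prev_stride - offset)
    return y

# Sanity check against the sequential definition out[t] = x[t] + gamma * out[t + 1]
x = torch.randn(2, 1000, dtype=torch.float64)
ref = x.clone()
for t in range(x.size(-1) - 2, -1, -1):
    ref[..., t] += 0.99 * ref[..., t + 1]
print((discounted_cumsum_right_stages(x, 0.99) - ref).abs().max().item())  # tiny, ~1e-12
```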