/* Code for CUDA stream compaction. Roughly based on:
    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
        High Performance Graphics 2009.

    Notes:
        - paper recommends 128 threads/block, so this is hard coded.
        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
          complicated and performs poorly on current hardware
        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
          performance across most of the tested workloads in the paper. The selective variant is not
          implemented.
        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
          manner. It is, however, done in a very easy to program manner, and integrated into the top of
          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
          dgBlockCounts in stages.

  Date:         23 Aug 2009
  Author:       CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
  Affiliation:  Stanford University
  License:      Public Domain
*/

// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
unsigned int sumReduce128(__local unsigned int* arr) {
    // Parallel reduce element counts
    // Assumes 128 thd/block
    int thread = get_local_id(0);
    if (thread < 64) arr[thread] += arr[thread+64];
    barrier(CLK_LOCAL_MEM_FENCE);
#ifdef WARPS_ARE_ATOMIC
    if (thread < 32) {
        arr[thread] += arr[thread+32];
        if (thread < 16) arr[thread] += arr[thread+16];
        if (thread < 8) arr[thread] += arr[thread+8];
        if (thread < 4) arr[thread] += arr[thread+4];
        if (thread < 2) arr[thread] += arr[thread+2];
        if (thread < 1) arr[thread] += arr[thread+1];
    }
#else
    if (thread < 32) arr[thread] += arr[thread+32];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (thread < 16) arr[thread] += arr[thread+16];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (thread < 8) arr[thread] += arr[thread+8];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (thread < 4) arr[thread] += arr[thread+4];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (thread < 2) arr[thread] += arr[thread+2];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (thread < 1) arr[thread] += arr[thread+1];
#endif
    barrier(CLK_LOCAL_MEM_FENCE);
    return arr[0];
}

__kernel void countElts(__global unsigned int* restrict dgBlockCounts, __global const unsigned int* restrict dgValid, const unsigned int len, __local unsigned int* restrict dsCount) {
    dsCount[get_local_id(0)] = 0;
    unsigned int ub;
    const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
    ub = (len < (get_group_id(0)+1)*eltsPerBlock) ? len : ((get_group_id(0) + 1)*eltsPerBlock);
    for (int base = get_group_id(0) * eltsPerBlock; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
        if ((base + get_local_id(0)) < ub && dgValid[base+get_local_id(0)])
            dsCount[get_local_id(0)]++;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    unsigned int blockCount = sumReduce128(dsCount);
    if (get_local_id(0) == 0) dgBlockCounts[get_group_id(0)] = blockCount;
    return;
}

// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
// Exclusive prefix scan over 128 elements
// Assumes 128 threads
// Taken from cuda SDK "scan" sample for naive scan, with small modifications
int exclusivePrescan128(__local const unsigned int* in, __local unsigned int* outAndTemp) {
    const int n=128;
    //TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
    //__shared__ int temp[2*n];
    __local unsigned int* temp = outAndTemp;
    int pout = 1, pin = 0;

    // load input into temp
    // This is exclusive scan, so shift right by one and set first elt to 0
    int thread = get_local_id(0);
    temp[pout*n + get_local_id(0)] = (get_local_id(0) > 0) ? in[get_local_id(0)-1] : 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin  = 1 - pout;
        barrier(CLK_LOCAL_MEM_FENCE);
        temp[pout*n+get_local_id(0)] = temp[pin*n+get_local_id(0)];
        if (get_local_id(0) >= offset)
            temp[pout*n+get_local_id(0)] += temp[pin*n+get_local_id(0) - offset];
    }

    //out[get_local_id(0)] = temp[pout*n+get_local_id(0)]; // write output
    barrier(CLK_LOCAL_MEM_FENCE);
    return outAndTemp[127]+in[127]; // Return sum of all elements
}

int compactSIMDPrefixSum(__local const unsigned int* dsData, __local const unsigned int* dsValid, __local unsigned int* dsCompact, __local unsigned int* dsLocalIndex) {
    int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
    int thread = get_local_id(0);
    if (dsValid[get_local_id(0)]) dsCompact[dsLocalIndex[get_local_id(0)]] = dsData[get_local_id(0)];
    return numValid;
}

__kernel void moveValidElementsStaged(__global const unsigned int* restrict dgData, __global unsigned int* restrict dgCompact, __global const unsigned int* restrict dgValid,
            __global const unsigned int* restrict dgBlockCounts, unsigned int len, __global unsigned int* restrict dNumValidElements,
            __local unsigned int* restrict inBlock, __local unsigned int* restrict validBlock, __local unsigned int* restrict compactBlock) {
    __local unsigned int dsLocalIndex[256];
    int blockOutOffset=0;
    // Sum up the blockCounts before us to find our offset
    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
    // Paper implements this as a prefix sum kernel in phase II
    // May still be faster than an extra kernel invocation?
    int thread = get_local_id(0);
    for (int base = 0; base < get_group_id(0); base += get_local_size(0)) {
        // Load up the count of valid elements for each block before us in batches of 128
        if ((base + get_local_id(0)) < get_group_id(0)) {
            validBlock[get_local_id(0)] = dgBlockCounts[base+get_local_id(0)];
        } else {
            validBlock[get_local_id(0)] = 0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Parallel reduce these counts
        // Accumulate in the final offset variable
        blockOutOffset += sumReduce128(validBlock);
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    unsigned int ub;
    const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
    ub = (len < (get_group_id(0)+1)*eltsPerBlock) ? len : ((get_group_id(0) + 1)*eltsPerBlock);
    for (int base = get_group_id(0) * eltsPerBlock; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
        if ((base + get_local_id(0)) < ub) {
            validBlock[get_local_id(0)] = dgValid[base+get_local_id(0)];
            inBlock[get_local_id(0)] = dgData[base+get_local_id(0)];
        } else {
            validBlock[get_local_id(0)] = 0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock,dsLocalIndex);
        barrier(CLK_LOCAL_MEM_FENCE);
        if (get_local_id(0) < numValidBlock) {
            dgCompact[blockOutOffset + get_local_id(0)] = compactBlock[get_local_id(0)];
        }
        blockOutOffset += numValidBlock;
    }
    if (get_group_id(0) == (get_num_groups(0)-1) && get_local_id(0) == 0) {
        *dNumValidElements = blockOutOffset;
    }
}