cudaCompact.cu 9.64 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185

/* Code for CUDA stream compaction. Roughly based on:
    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
        High Performance Graphics 2009.

    Notes:
        - paper recommends 128 threads/block, so this is hard coded.
        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
          complicated and performs poorly on current hardware
        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
          performance across most of the tested workloads in the paper. The selective variant is not
          implemented.
        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
          manner. It is, however, done in a very easy to program manner, and integrated into the top of
          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
          dgBlockCounts in stages.

  Date:         23 Aug 2009
  Author:       Imran Haque (ihaque@cs.stanford.edu)
  Affiliation:  Stanford University
  License:      Public Domain
*/

#include "cudaCompact.h"

typedef unsigned int T;

// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
__device__ unsigned int sumReduce128(unsigned int* arr) {
    // Parallel reduce element counts
    // Assumes 128 thd/block
    if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64];
    __syncthreads();
    if (threadIdx.x < 32) {
        arr[threadIdx.x] += arr[threadIdx.x+32];
        if (threadIdx.x < 16) arr[threadIdx.x] += arr[threadIdx.x+16];
        if (threadIdx.x < 8) arr[threadIdx.x] += arr[threadIdx.x+8];
        if (threadIdx.x < 4) arr[threadIdx.x] += arr[threadIdx.x+4];
        if (threadIdx.x < 2) arr[threadIdx.x] += arr[threadIdx.x+2];
        if (threadIdx.x < 1) arr[threadIdx.x] += arr[threadIdx.x+1];
    }
    __syncthreads();
    return arr[0];
}

__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
    __shared__ unsigned int dsCount[128];
    dsCount[threadIdx.x] = 0;
    size_t ub;
    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
        if ((base + threadIdx.x) < ub && dgValid[base+threadIdx.x])
            dsCount[threadIdx.x]++;
    }
    __syncthreads();
    unsigned int blockCount = sumReduce128(dsCount);
    if (threadIdx.x == 0) dgBlockCounts[blockIdx.x] = blockCount;
    return;
}

// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
// Exclusive prefix scan over 128 elements
// Assumes 128 threads
// Taken from cuda SDK "scan" sample for naive scan, with small modifications
__device__ int exclusivePrescan128(const unsigned int* in,unsigned int* outAndTemp) {
    const int n=128;
    //TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
    //__shared__ int temp[2*n];
    unsigned int* temp = outAndTemp;
    int pout = 1, pin = 0;

    // load input into temp
    // This is exclusive scan, so shift right by one and set first elt to 0
    temp[pout*n + threadIdx.x] = (threadIdx.x > 0) ? in[threadIdx.x-1] : 0;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin  = 1 - pout;
        __syncthreads();
        temp[pout*n+threadIdx.x] = temp[pin*n+threadIdx.x];
        if (threadIdx.x >= offset)
            temp[pout*n+threadIdx.x] += temp[pin*n+threadIdx.x - offset];
    }

    //out[threadIdx.x] = temp[pout*n+threadIdx.x]; // write output
    __syncthreads();
    return outAndTemp[127]+in[127]; // Return sum of all elements
}
__device__ int compactSIMDPrefixSum(const T* dsData,const unsigned int* dsValid,T* dsCompact) {
    __shared__ unsigned int dsLocalIndex[256];
    int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
    if (dsValid[threadIdx.x]) dsCompact[dsLocalIndex[threadIdx.x]] = dsData[threadIdx.x];
    return numValid;
}

__global__ void moveValidElementsStaged(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
    __shared__ T inBlock[128];
    __shared__ unsigned int validBlock[128];
    __shared__ T compactBlock[128];
    int blockOutOffset=0;
    // Sum up the blockCounts before us to find our offset
    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
    // Paper implements this as a prefix sum kernel in phase II
    // May still be faster than an extra kernel invocation?
    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
        // Load up the count of valid elements for each block before us in batches of 128
        if ((base + threadIdx.x) < blockIdx.x) {
            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
        } else {
            validBlock[threadIdx.x] = 0;
        }
        __syncthreads();
        // Parallel reduce these counts
        // Accumulate in the final offset variable
        blockOutOffset += sumReduce128(validBlock);
    }

    size_t ub;
    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
        if ((base + threadIdx.x) < ub) {
            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
        } else {
            validBlock[threadIdx.x] = 0;
        }
        __syncthreads();
        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
        __syncthreads();
        if (threadIdx.x < numValidBlock) {
            dgCompact[blockOutOffset + threadIdx.x] = compactBlock[threadIdx.x];
        }
        blockOutOffset += numValidBlock;
    }
    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
        *dNumValidElements = blockOutOffset;
    }
}

__global__ void moveValidElementsScattered(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
    __shared__ T inBlock[128];
    __shared__ unsigned int validBlock[128];
    T* compactBlock=dgCompact;
    size_t blockOutOffset = 0;
    // Sum up the blockCounts before us to find our offset
    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
    // Paper implements this as a prefix sum kernel in phase II
    // May still be faster than an extra kernel invocation?
    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
        // Load up the count of valid elements for each block before us in batches of 128
        if ((base + threadIdx.x) < blockIdx.x) {
            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
        } else {
            validBlock[threadIdx.x] = 0;
        }
        __syncthreads();
        // Parallel reduce these counts
        // Accumulate in the final offset variable
        blockOutOffset += sumReduce128(validBlock);
    }
    compactBlock += blockOutOffset;
    size_t ub;
    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
        if ((base + threadIdx.x) < ub) {
            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
        } else {
            validBlock[threadIdx.x] = 0;
        }
        __syncthreads();
        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
        blockOutOffset += numValidBlock;
        compactBlock += numValidBlock;
    }
    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
        *dNumValidElements = blockOutOffset;
    }
}

186
void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput) {
Peter Eastman's avatar
Peter Eastman committed
187
188
    int device;
    cudaGetDevice(&device);
189
    cudaDeviceProp deviceProp;
Peter Eastman's avatar
Peter Eastman committed
190
    cudaGetDeviceProperties(&deviceProp, device);
191
192
193
194
195
196
197
    d.nThreadBlocks = 16*deviceProp.multiProcessorCount;
    cudaMalloc((void**)&(d.dgBlockCounts), d.nThreadBlocks*sizeof(unsigned int));
    d.stageOutput = stageOutput;
    // TODO: make sure allocation worked
    d.valid = true;
}

198
void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d) {
199
200
201
    if (d.valid) cudaFree(d.dgBlockCounts);
}

202
int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,T* dOut,const T* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid) {
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
    if (!d.valid) {
        return -1;
    }
    // Figure out # elements per block
    unsigned int numBlocks = d.nThreadBlocks;
    if (numBlocks*128 > len)
        numBlocks = (len+127)/128;
    const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);

    // TODO: implement loop over blocks of 10M
    // Phase 1: Calculate number of valid elements per thread block
    countElts<<<numBlocks,128>>>(d.dgBlockCounts,dValid,eltsPerBlock,len);

    // Phase 2/3: Move valid elements using SIMD compaction
    if (d.stageOutput) {
        moveValidElementsStaged<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
    } else {
        moveValidElementsScattered<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
    }
    return 0;
}