utilities.cu 3.61 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/**
 * Device helper shared by the clear*Buffer(s) kernels: sets the first `size`
 * ints of `buffer` to 0.
 *
 * buffer: the int array to zero.
 * size:   number of ints to zero.
 *
 * The bulk of the array is written four ints at a time through an int4 view.
 * Every thread starts at its global x index and advances by the total number
 * of threads in the grid (a grid-stride loop), so the helper works for any
 * 1D launch size. The size%4 ints that do not fill a whole int4 are written
 * by the single thread whose global index is 0.
 *
 * NOTE(review): the int4 view presumably requires `buffer` to be 16-byte
 * aligned -- confirm that every caller passes a suitably aligned pointer.
 * NOTE(review): make_int4() is called with a single argument; that overload
 * is not defined in this block and is presumably supplied by a project header.
 */
__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
    const int globalThread = blockDim.x*blockIdx.x+threadIdx.x;
    const int numVectors = size/4;
    int4* vectors = (int4*) buffer;

    // Vectorized part: one int4 store per loop iteration.
    for (int v = globalThread; v < numVectors; v += blockDim.x*gridDim.x)
        vectors[v] = make_int4(0);

    // Scalar part: at most three trailing ints, handled by one thread only.
    if (globalThread == 0) {
        for (int tail = numVectors*4; tail < size; tail++)
            buffer[tail] = 0;
    }
}

/**
 * Kernel that fills a buffer with 0.
 *
 * buffer: the int array to zero.
 * size:   number of ints in `buffer` to zero.
 *
 * All of the work is delegated to clearSingleBuffer(), which distributes the
 * stores over the threads of the grid using the x dimension of the launch.
 * That helper writes through an int4 view of `buffer`.
 * NOTE(review): this presumably requires 16-byte alignment -- confirm.
 */
__global__ void clearBuffer(int* __restrict__ buffer, int size) {
    clearSingleBuffer(buffer, size);
}

/**
 * Kernel that fills two buffers with 0 in a single launch.
 *
 * bufferN: the N'th int array to zero.
 * sizeN:   number of ints in bufferN to zero; the sizes are independent.
 *
 * Each buffer is cleared in turn by clearSingleBuffer(), using the same
 * threads for every buffer. The pointers are declared __restrict__, so the
 * caller is promising that the buffers do not overlap.
 */
__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
    clearSingleBuffer(buffer1, size1);
    clearSingleBuffer(buffer2, size2);
}

/**
 * Kernel that fills three buffers with 0 in a single launch.
 *
 * bufferN: the N'th int array to zero.
 * sizeN:   number of ints in bufferN to zero; the sizes are independent.
 *
 * Each buffer is cleared in turn by clearSingleBuffer(), using the same
 * threads for every buffer. The pointers are declared __restrict__, so the
 * caller is promising that the buffers do not overlap.
 */
__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
    clearSingleBuffer(buffer1, size1);
    clearSingleBuffer(buffer2, size2);
    clearSingleBuffer(buffer3, size3);
}

/**
 * Kernel that fills four buffers with 0 in a single launch.
 *
 * bufferN: the N'th int array to zero.
 * sizeN:   number of ints in bufferN to zero; the sizes are independent.
 *
 * Each buffer is cleared in turn by clearSingleBuffer(), using the same
 * threads for every buffer. The pointers are declared __restrict__, so the
 * caller is promising that the buffers do not overlap.
 */
__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
    clearSingleBuffer(buffer1, size1);
    clearSingleBuffer(buffer2, size2);
    clearSingleBuffer(buffer3, size3);
    clearSingleBuffer(buffer4, size4);
}

/**
 * Kernel that fills five buffers with 0 in a single launch.
 *
 * bufferN: the N'th int array to zero.
 * sizeN:   number of ints in bufferN to zero; the sizes are independent.
 *
 * Each buffer is cleared in turn by clearSingleBuffer(), using the same
 * threads for every buffer. The pointers are declared __restrict__, so the
 * caller is promising that the buffers do not overlap.
 */
__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
    clearSingleBuffer(buffer1, size1);
    clearSingleBuffer(buffer2, size2);
    clearSingleBuffer(buffer3, size3);
    clearSingleBuffer(buffer4, size4);
    clearSingleBuffer(buffer5, size5);
}

/**
 * Kernel that fills six buffers with 0 in a single launch.
 *
 * bufferN: the N'th int array to zero.
 * sizeN:   number of ints in bufferN to zero; the sizes are independent.
 *
 * Each buffer is cleared in turn by clearSingleBuffer(), using the same
 * threads for every buffer. The pointers are declared __restrict__, so the
 * caller is promising that the buffers do not overlap.
 */
__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
    clearSingleBuffer(buffer1, size1);
    clearSingleBuffer(buffer2, size2);
    clearSingleBuffer(buffer3, size3);
    clearSingleBuffer(buffer4, size4);
    clearSingleBuffer(buffer5, size5);
    clearSingleBuffer(buffer6, size6);
}

/**
 * Kernel that sums a collection of buffers into the first one.
 *
 * buffer:     numBuffers consecutive sub-buffers, each holding bufferSize
 *             float4 elements. On return the first sub-buffer holds, for every
 *             element position, the sum over all sub-buffers; the other
 *             sub-buffers are only read.
 * bufferSize: number of float4 elements in each sub-buffer.
 * numBuffers: number of sub-buffers stored back to back in `buffer`.
 *
 * Element positions are distributed over the grid with a grid-stride loop on
 * the x dimension, so any 1D launch size is valid. For each position the
 * sub-buffers are added in ascending order.
 *
 * NOTE(review): the `+=` on float4 is not a CUDA built-in; it is presumably
 * supplied by a project header.
 * NOTE(review): bufferSize*numBuffers is computed in int -- confirm that the
 * product cannot exceed INT_MAX for the sizes used by callers.
 */
__global__ void reduceFloat4Buffer(float4* __restrict__ buffer, int bufferSize, int numBuffers) {
    const int end = bufferSize*numBuffers;
    for (int slot = blockDim.x*blockIdx.x+threadIdx.x; slot < bufferSize; slot += blockDim.x*gridDim.x) {
        // Start from the value already in the first sub-buffer...
        float4 total = buffer[slot];
        // ...then add the matching element of every later sub-buffer.
        int src = slot+bufferSize;
        while (src < end) {
            total += buffer[src];
            src += bufferSize;
        }
        buffer[slot] = total;
    }
}

/**
 * Kernel that sums the various buffers containing forces.
 *
 * longBuffer: integer accumulators, read as three consecutive arrays of
 *             bufferSize entries: element i supplies x, element i+bufferSize
 *             supplies y and element i+2*bufferSize supplies z.
 * buffer:     numBuffers consecutive sub-buffers of bufferSize float4
 *             elements. On return the first sub-buffer holds, per element, the
 *             scaled integer value plus the sum of that element over all
 *             sub-buffers (including the first one's previous contents).
 * bufferSize: number of elements in each sub-buffer.
 * numBuffers: number of float4 sub-buffers stored back to back in `buffer`.
 *
 * Elements are distributed over the grid with a grid-stride loop on the x
 * dimension, so any 1D launch size is valid.
 *
 * NOTE(review): presumably longBuffer holds fixed-point values with 32
 * fractional bits -- confirm against the code that fills it.
 * NOTE(review): the width of `long` is platform-dependent (it is 32 bits on
 * LLP64 platforms such as 64-bit Windows); confirm it matches the type used by
 * whatever writes longBuffer.
 * NOTE(review): the `+=` on float4 is not a CUDA built-in; it is presumably
 * supplied by a project header.
 */
__global__ void reduceForces(const long* __restrict__ longBuffer, float4* __restrict__ buffer, int bufferSize, int numBuffers) {
    // 0xFFFFFFFF is not exactly representable as a float and rounds to 2^32,
    // so this factor is effectively 2^-32.
    const float scale = 1.0f/(float) 0xFFFFFFFF;
    const int totalSize = bufferSize*numBuffers;

    int index = blockDim.x*blockIdx.x+threadIdx.x;
    while (index < bufferSize) {
        // Convert the three integer components to floating point.
        const float fx = scale*longBuffer[index];
        const float fy = scale*longBuffer[index+bufferSize];
        const float fz = scale*longBuffer[index+2*bufferSize];
        float4 sum = make_float4(fx, fy, fz, 0.0f);

        // Add this element from every float4 sub-buffer, starting with the first.
        for (int i = index; i < totalSize; i += bufferSize)
            sum += buffer[i];

        buffer[index] = sum;
        index += blockDim.x*gridDim.x;
    }
}