/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2010-2025 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

peastman's avatar
peastman committed
27
28
29
30
#ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
    #define NOMINMAX
#endif
31
32
#include "OpenCLSort.h"
#include "OpenCLKernelSources.h"
peastman's avatar
peastman committed
33
#include <algorithm>
34
#include <map>
peastman's avatar
peastman committed
35
#include <string>
36
37
38
39

using namespace OpenMM;
using namespace std;

40
/**
 * Create an OpenCLSort for sorting arrays of a fixed length.
 *
 * The constructor builds the sort program with the type and key definitions
 * supplied by the trait, creates every kernel used by sort(), chooses work group
 * sizes from the device limits, and allocates the workspace arrays.
 *
 * @param context  the context in which the kernels are built and executed
 * @param trait    supplies the data type, key type, sort key expression and key
 *                 limits that are substituted into the kernel source.  It is
 *                 deleted by the destructor, so ownership passes to this object.
 * @param length   the number of elements in each array that will be sorted
 * @param uniform  selects which bucket assignment kernel is used, and is
 *                 substituted into the kernel source as UNIFORM
 */
OpenCLSort::OpenCLSort(OpenCLContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) :
        context(context), trait(trait), dataLength(length), uniform(uniform) {
    // Create kernels.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] =  trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["UNIFORM"] = (uniform ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    shortList2Kernel = cl::Kernel(program, "sortShortList2");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    // Create the bucket assignment kernel exactly once, choosing the variant
    // that matches the uniform flag.
    assignElementsKernel = cl::Kernel(program, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    // Only half of the local memory (measured in elements) is budgeted for the sort buffer.
    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
    int maxShortList = max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
    string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
    if (vendor.compare(0, 6, "NVIDIA") == 0) {
        maxShortList = min(3000, maxShortList);
        useShortList2 = (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
    }
    else {
        maxShortList = min(1024, maxShortList);
        useShortList2 = false;
    }
    isShortList = (length <= maxShortList);

    // Use the largest power of two that does not exceed maxRangeSize.
    rangeKernelSize = 1;
    while (rangeKernelSize*2 <= maxRangeSize)
        rangeKernelSize *= 2;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
    // Guard the division below: sortKernelSize can be 0 or 1 when the device
    // reports a very small work group size or local memory size.
    if (targetBucketSize < 1)
        targetBucketSize = 1;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
    bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
    bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
    offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
}

/**
 * Release the sort trait that was handed to the constructor.
 */
OpenCLSort::~OpenCLSort() {
    // This object is responsible for freeing the trait it was constructed with.
    delete this->trait;
}

108
/**
 * Sort an array on the device.
 *
 * Short arrays are first attempted with a single-kernel sort.  If that attempt
 * throws, or the array is not short, a multi-pass bucket sort is run instead:
 * compute the key range, assign elements to buckets, compute bucket positions,
 * copy the data into the buckets, and sort each bucket.
 *
 * @param data  the array to sort.  Its size must equal the length passed to the
 *              constructor and its element size must equal the trait's data size.
 * @throws OpenMMException if the size or element size of data does not match
 */
void OpenCLSort::sort(ArrayInterface& data) {
    if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
        throw OpenMMException("OpenCLSort called with different data size");
    if (data.getSize() == 0)
        return;
    OpenCLArray& cldata = context.unwrap(data);
    if (isShortList) {
        // We can use a simpler sort kernel that does the entire operation in one kernel.

        try {
            if (useShortList2) {
                // This variant writes into the buckets array, which is then copied back over the input.
                shortList2Kernel.setArg<cl::Buffer>(0, cldata.getDeviceBuffer());
                shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
                shortList2Kernel.setArg<cl_int>(2, dataLength);
                context.executeKernel(shortList2Kernel, dataLength);
                buckets.copyTo(cldata);
            }
            else {
                shortListKernel.setArg<cl::Buffer>(0, cldata.getDeviceBuffer());
                shortListKernel.setArg<cl_uint>(1, dataLength);
                // A size with a NULL pointer is the OpenCL convention for a local memory argument.
                shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
                context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
            }
            return;
        }
        catch (const std::exception&) {
            // This can happen if we chose too large a size for the kernel.  Switch
            // over to the standard sorting method, now and for all later calls.

            isShortList = false;
        }
    }

    // Compute the range of data values.

    unsigned int numBuckets = bucketOffset.getSize();
    computeRangeKernel.setArg<cl::Buffer>(0, cldata.getDeviceBuffer());
    computeRangeKernel.setArg<cl_uint>(1, cldata.getSize());
    computeRangeKernel.setArg<cl::Buffer>(2, dataRange.getDeviceBuffer());
    computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
    computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
    computeRangeKernel.setArg<cl_int>(5, numBuckets);
    computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset.getDeviceBuffer());
    context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);

    // Assign array elements to buckets.

    assignElementsKernel.setArg<cl::Buffer>(0, cldata.getDeviceBuffer());
    assignElementsKernel.setArg<cl_int>(1, cldata.getSize());
    assignElementsKernel.setArg<cl_int>(2, numBuckets);
    assignElementsKernel.setArg<cl::Buffer>(3, dataRange.getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset.getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement.getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket.getDeviceBuffer());
    context.executeKernel(assignElementsKernel, cldata.getSize());

    // Compute the position of each bucket.

    computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
    computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset.getDeviceBuffer());
    computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
    context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);

    // Copy the data into the buckets.

    copyToBucketsKernel.setArg<cl::Buffer>(0, cldata.getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
    copyToBucketsKernel.setArg<cl_int>(2, cldata.getSize());
    copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement.getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket.getDeviceBuffer());
    context.executeKernel(copyToBucketsKernel, cldata.getSize());

    // Sort each bucket.

    sortBucketsKernel.setArg<cl::Buffer>(0, cldata.getDeviceBuffer());
    sortBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
    sortBucketsKernel.setArg<cl_int>(2, numBuckets);
    sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
    sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
    // Round the work size up to a whole number of work groups.
    context.executeKernel(sortBucketsKernel, ((cldata.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}