#ifndef __OPENMM_OPENCLSORT_H__ #define __OPENMM_OPENCLSORT_H__ /* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2010 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include "OpenCLArray.h" #include "OpenCLKernelSources.h" #include "openmm/internal/windowsExport.h" #include namespace OpenMM { /** * This class sorts arrays of values. It supports any type of values, not just scalars, * so long as an appropriate sorting key can be defined by which to sort them. * * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket * (in local memory when possible, in global memory otherwise). This is similar to * the algorithm described in * * Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient * Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7), * pp. 915-921 (2009) * * but with many modifications and simplifications. In particular, this algorithm * involves much less communication between host and device, which is critical to get * good performance with the array sizes we typically work with (10,000 to 100,000 * elements). */ template class OPENMM_EXPORT OpenCLSort { public: /** * Create an OpenCLSort object for sorting data of a particular type. * * @param context the context in which to perform calculations * @param length the length of the arrays this object will be used to sort * @param typeName the name of the data type being sorting (e.g. "float") * @param sortKey an expression that returns the value by which the variable "value" should be sorted. * For primitive types, this will simply be "value". */ OpenCLSort(OpenCLContext& context, int length, const std::string& typeName, const std::string& sortKey) : context(context), dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) { // Create kernels. std::map replacements; replacements["TYPE"] = typeName; replacements["SORT_KEY"] = sortKey; cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements)); computeRangeKernel = cl::Kernel(program, "computeRange"); assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets"); computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions"); copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets"); sortBucketsKernel = cl::Kernel(program, "sortBuckets"); // Work out the work group sizes for various kernels. int maxGroupSize = context.getDevice().getInfo(); for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2) ; positionsKernelSize = rangeKernelSize; sortKernelSize = rangeKernelSize/2; if (rangeKernelSize > length) rangeKernelSize = length; int maxLocalBuffer = (context.getDevice().getInfo()/sizeof(TYPE))/2; if (sortKernelSize > maxLocalBuffer) sortKernelSize = maxLocalBuffer; int targetBucketSize = sortKernelSize/2; int numBuckets = length/targetBucketSize; if (numBuckets < 1) numBuckets = 1; if (positionsKernelSize > numBuckets) positionsKernelSize = numBuckets; // Create workspace arrays. dataRange = new OpenCLArray(context, 1, "sortDataRange"); bucketOffset = new OpenCLArray(context, numBuckets, "bucketOffset"); bucketOfElement = new OpenCLArray(context, length, "bucketOfElement"); offsetInBucket = new OpenCLArray(context, length, "offsetInBucket"); buckets = new OpenCLArray(context, length, "buckets"); } ~OpenCLSort() { if (dataRange != NULL) delete dataRange; if (bucketOfElement != NULL) delete bucketOfElement; if (offsetInBucket != NULL) delete offsetInBucket; if (bucketOffset != NULL) delete bucketOffset; if (buckets != NULL) delete buckets; } /** * Sort an array. */ void sort(OpenCLArray& data) { // Compute the range of data values. computeRangeKernel.setArg(0, data.getDeviceBuffer()); computeRangeKernel.setArg(1, data.getSize()); computeRangeKernel.setArg(2, dataRange->getDeviceBuffer()); computeRangeKernel.setArg(3, rangeKernelSize*sizeof(cl_float), NULL); context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize); // Assign array elements to buckets. int numBuckets = bucketOffset->getSize(); context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets); assignElementsKernel.setArg(0, data.getDeviceBuffer()); assignElementsKernel.setArg(1, data.getSize()); assignElementsKernel.setArg(2, numBuckets); assignElementsKernel.setArg(3, dataRange->getDeviceBuffer()); assignElementsKernel.setArg(4, bucketOffset->getDeviceBuffer()); assignElementsKernel.setArg(5, bucketOfElement->getDeviceBuffer()); assignElementsKernel.setArg(6, offsetInBucket->getDeviceBuffer()); context.executeKernel(assignElementsKernel, data.getSize()); // Compute the position of each bucket. computeBucketPositionsKernel.setArg(0, numBuckets); computeBucketPositionsKernel.setArg(1, bucketOffset->getDeviceBuffer()); computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL); context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize); // Copy the data into the buckets. copyToBucketsKernel.setArg(0, data.getDeviceBuffer()); copyToBucketsKernel.setArg(1, buckets->getDeviceBuffer()); copyToBucketsKernel.setArg(2, data.getSize()); copyToBucketsKernel.setArg(3, bucketOffset->getDeviceBuffer()); copyToBucketsKernel.setArg(4, bucketOfElement->getDeviceBuffer()); copyToBucketsKernel.setArg(5, offsetInBucket->getDeviceBuffer()); context.executeKernel(copyToBucketsKernel, data.getSize()); // Sort each bucket. sortBucketsKernel.setArg(0, data.getDeviceBuffer()); sortBucketsKernel.setArg(1, buckets->getDeviceBuffer()); sortBucketsKernel.setArg(2, numBuckets); sortBucketsKernel.setArg(3, bucketOffset->getDeviceBuffer()); sortBucketsKernel.setArg(4, sortKernelSize*sizeof(mm_float2), NULL); context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize); } private: OpenCLContext& context; OpenCLArray* dataRange; OpenCLArray* bucketOfElement; OpenCLArray* offsetInBucket; OpenCLArray* bucketOffset; OpenCLArray* buckets; cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; int rangeKernelSize, positionsKernelSize, sortKernelSize; }; } // namespace OpenMM #endif // __OPENMM_OPENCLSORT_H__