HipSort.cpp 7.84 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2010-2025 Stanford University and the Authors.      *
10
 * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "HipSort.h"
#include "HipKernelSources.h"
#include <algorithm>
#include <map>

using namespace OpenMM;
using namespace std;

36
HipSort::HipSort(HipContext& context, ComputeSortImpl::SortTrait* trait, unsigned int length, bool uniform) :
37
        context(context), trait(trait), dataLength(length), uniform(uniform) {
38
39
40
41
42
43
44
45
46
    // Create kernels.

    map<string, string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] =  trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
47
    replacements["UNIFORM"] = (uniform ? "1" : "0");
48
49
50
51
    hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
    shortListKernel = context.getKernel(module, "sortShortList");
    shortList2Kernel = context.getKernel(module, "sortShortList2");
    computeRangeKernel = context.getKernel(module, "computeRange");
52
    assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
53
54
55
56
57
58
59
60
61
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
    sortBucketsKernel = context.getKernel(module, "sortBuckets");

    // Work out the work group sizes for various kernels.

    int maxSharedMem;
    hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
62
    int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
63
    isShortList = (length <= maxShortList);
64
65
    sortKernelSize = 256;
    rangeKernelSize = 256;
66
67
    if (rangeKernelSize > length)
        rangeKernelSize = length;
68
    rangeKernelBlocks = (length + rangeKernelSize - 1) / rangeKernelSize;
69
70
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
71
    unsigned int targetBucketSize = uniform ? sortKernelSize/2 : sortKernelSize/8;
72
73
74
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
75
76
    // computeBucketPositions is executed as a single work group so larger block size is faster.
    positionsKernelSize = 1024;
77
78
79
80
81
82
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    if (!isShortList) {
83
84
85
86
        counters.initialize<unsigned int>(context, 1, "counters");
        unsigned int zero = 0;
        counters.upload(&zero);
        dataRange.initialize(context, 2*rangeKernelBlocks, trait->getKeySize(), "sortDataRange");
87
88
89
90
91
92
93
94
95
96
97
        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
    }
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
}

HipSort::~HipSort() {
    delete trait;
}

98
void HipSort::sort(ArrayInterface& data) {
99
100
101
102
    if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
        throw OpenMMException("HipSort called with different data size");
    if (data.getSize() == 0)
        return;
103
    HipArray& cudata = context.unwrap(data);
104
105
106
107
    if (isShortList) {
        // We can use a simpler sort kernel that does the entire operation in one kernel.

        if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
108
            void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
109
            context.executeKernel(shortList2Kernel, sortArgs, dataLength, HipContext::ThreadBlockSize, HipContext::ThreadBlockSize*trait->getKeySize());
110
            buckets.copyTo(cudata);
111
112
        }
        else {
113
            void* sortArgs[] = {&cudata.getDevicePointer(), &dataLength};
114
115
116
117
118
119
120
            context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
        }
    }
    else {
        // Compute the range of data values.

        unsigned int numBuckets = bucketOffset.getSize();
121
        void* rangeArgs[] = {&cudata.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
122
        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelBlocks*rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
123
124
125

        // Assign array elements to buckets.

126
        void* elementsArgs[] = {&cudata.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
127
                &bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
128
        context.executeKernel(assignElementsKernel, elementsArgs, cudata.getSize(), 128);
129
130
131

        // Compute the position of each bucket.

132
        void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
133
134
135
136
        context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

        // Copy the data into the buckets.

137
        void* copyArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
138
                &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
139
        context.executeKernel(copyToBucketsKernel, copyArgs, cudata.getSize());
140
141
142

        // Sort each bucket.

143
        void* sortArgs[] = {&cudata.getDevicePointer(), &buckets.getDevicePointer(), &bucketOffset.getDevicePointer()};
144
        context.executeKernelFlat(sortBucketsKernel, sortArgs, numBuckets*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
145
146
    }
}