Unverified Commit 2975f44b authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Optimized reducing energy (#3902)

parent e0c80069
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -435,21 +435,23 @@ void CudaContext::initialize() {
ContextSelector selector(*this);
string errorMessage = "Error initializing Context";
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
int multiprocessors;
CHECK_RESULT2(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "Error checking GPU properties");
if (useDoublePrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, 1, "energySum");
energySum.initialize<double>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else if (useMixedPrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, 1, "energySum");
energySum.initialize<double>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else {
energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<float>(*this, 1, "energySum");
energySum.initialize<float>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
}
......@@ -820,12 +822,18 @@ double CudaContext::reduceEnergy() {
int bufferSize = energyBuffer.getSize();
int workGroupSize = 512;
void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum.download(pinnedBuffer);
if (getUseDoublePrecision() || getUseMixedPrecision())
return *((double*) pinnedBuffer);
else
return *((float*) pinnedBuffer);
double result = 0;
if (getUseDoublePrecision() || getUseMixedPrecision()) {
for (int i = 0; i < energySum.getSize(); i++)
result += ((double*) pinnedBuffer)[i];
}
else {
for (int i = 0; i < energySum.getSize(); i++)
result += ((float*) pinnedBuffer)[i];
}
return result;
}
void CudaContext::setCharges(const vector<double>& charges) {
......
......@@ -80,7 +80,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
extern __shared__ mixed tempBuffer[];
const unsigned int thread = threadIdx.x;
mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += blockDim.x)
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x)
sum += energyBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) {
......@@ -89,7 +89,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
result[blockIdx.x] = tempBuffer[0];
}
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2020 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -497,23 +497,24 @@ void OpenCLContext::initialize() {
bonded->initialize(system);
numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size());
int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
int numComputeUnits = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
if (useDoublePrecision) {
forceBuffers.initialize<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize<mm_double4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize<cl_double>(*this, energyBufferSize, "energyBuffer");
energySum.initialize<cl_double>(*this, 1, "energySum");
energySum.initialize<cl_double>(*this, numComputeUnits, "energySum");
}
else if (useMixedPrecision) {
forceBuffers.initialize<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize<mm_float4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize<cl_double>(*this, energyBufferSize, "energyBuffer");
energySum.initialize<cl_double>(*this, 1, "energySum");
energySum.initialize<cl_double>(*this, numComputeUnits, "energySum");
}
else {
forceBuffers.initialize<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize<mm_float4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize<cl_float>(*this, energyBufferSize, "energyBuffer");
energySum.initialize<cl_float>(*this, 1, "energySum");
energySum.initialize<cl_float>(*this, numComputeUnits, "energySum");
}
reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer.getDeviceBuffer());
reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers.getDeviceBuffer());
......@@ -798,17 +799,18 @@ double OpenCLContext::reduceEnergy() {
reduceEnergyKernel.setArg<cl_int>(2, energyBuffer.getSize());
reduceEnergyKernel.setArg<cl_int>(3, workGroupSize);
reduceEnergyKernel.setArg(4, workGroupSize*energyBuffer.getElementSize(), NULL);
executeKernel(reduceEnergyKernel, workGroupSize, workGroupSize);
executeKernel(reduceEnergyKernel, workGroupSize*energySum.getSize(), workGroupSize);
energySum.download(pinnedMemory);
double result = 0;
if (getUseDoublePrecision() || getUseMixedPrecision()) {
double energy;
energySum.download(&energy);
return energy;
for (int i = 0; i < energySum.getSize(); i++)
result += ((double*) pinnedMemory)[i];
}
else {
float energy;
energySum.download(&energy);
return energy;
for (int i = 0; i < energySum.getSize(); i++)
result += ((float*) pinnedMemory)[i];
}
return result;
}
void OpenCLContext::setCharges(const vector<double>& charges) {
......
......@@ -108,7 +108,7 @@ __kernel void reduceForces(__global long* restrict longBuffer, __global real4* r
__kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global mixed* restrict result, int bufferSize, int workGroupSize, __local mixed* tempBuffer) {
const unsigned int thread = get_local_id(0);
mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
for (unsigned int index = get_global_id(0); index < bufferSize; index += get_global_size(0))
sum += energyBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) {
......@@ -117,7 +117,7 @@ __kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
result[get_group_id(0)] = tempBuffer[0];
}
/**
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment