"platforms/vscode:/vscode.git/clone" did not exist on "729c09907584fde040dc58c860181d25b133b68d"
Unverified Commit 2975f44b authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Optimized reducing energy (#3902)

parent e0c80069
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2021 Stanford University and the Authors. * * Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -435,21 +435,23 @@ void CudaContext::initialize() { ...@@ -435,21 +435,23 @@ void CudaContext::initialize() {
ContextSelector selector(*this); ContextSelector selector(*this);
string errorMessage = "Error initializing Context"; string errorMessage = "Error initializing Context";
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
int multiprocessors;
CHECK_RESULT2(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "Error checking GPU properties");
if (useDoublePrecision) { if (useDoublePrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, 1, "energySum"); energySum.initialize<double>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, 1, "energySum"); energySum.initialize<double>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else { else {
energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<float>(*this, 1, "energySum"); energySum.initialize<float>(*this, multiprocessors, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
} }
...@@ -820,12 +822,18 @@ double CudaContext::reduceEnergy() { ...@@ -820,12 +822,18 @@ double CudaContext::reduceEnergy() {
int bufferSize = energyBuffer.getSize(); int bufferSize = energyBuffer.getSize();
int workGroupSize = 512; int workGroupSize = 512;
void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize}; void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize()); executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum.download(pinnedBuffer); energySum.download(pinnedBuffer);
if (getUseDoublePrecision() || getUseMixedPrecision()) double result = 0;
return *((double*) pinnedBuffer); if (getUseDoublePrecision() || getUseMixedPrecision()) {
else for (int i = 0; i < energySum.getSize(); i++)
return *((float*) pinnedBuffer); result += ((double*) pinnedBuffer)[i];
}
else {
for (int i = 0; i < energySum.getSize(); i++)
result += ((float*) pinnedBuffer)[i];
}
return result;
} }
void CudaContext::setCharges(const vector<double>& charges) { void CudaContext::setCharges(const vector<double>& charges) {
......
...@@ -80,7 +80,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re ...@@ -80,7 +80,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
extern __shared__ mixed tempBuffer[]; extern __shared__ mixed tempBuffer[];
const unsigned int thread = threadIdx.x; const unsigned int thread = threadIdx.x;
mixed sum = 0; mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += blockDim.x) for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x)
sum += energyBuffer[index]; sum += energyBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) { for (int i = 1; i < workGroupSize; i *= 2) {
...@@ -89,7 +89,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re ...@@ -89,7 +89,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
tempBuffer[thread] += tempBuffer[thread+i]; tempBuffer[thread] += tempBuffer[thread+i];
} }
if (thread == 0) if (thread == 0)
*result = tempBuffer[0]; result[blockIdx.x] = tempBuffer[0];
} }
/** /**
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2020 Stanford University and the Authors. * * Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -497,23 +497,24 @@ void OpenCLContext::initialize() { ...@@ -497,23 +497,24 @@ void OpenCLContext::initialize() {
bonded->initialize(system); bonded->initialize(system);
numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size()); numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size());
int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
int numComputeUnits = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
if (useDoublePrecision) { if (useDoublePrecision) {
forceBuffers.initialize<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers.initialize<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize<mm_double4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force"); force.initialize<mm_double4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize<cl_double>(*this, energyBufferSize, "energyBuffer"); energyBuffer.initialize<cl_double>(*this, energyBufferSize, "energyBuffer");
energySum.initialize<cl_double>(*this, 1, "energySum"); energySum.initialize<cl_double>(*this, numComputeUnits, "energySum");
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
forceBuffers.initialize<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers.initialize<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize<mm_float4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force"); force.initialize<mm_float4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize<cl_double>(*this, energyBufferSize, "energyBuffer"); energyBuffer.initialize<cl_double>(*this, energyBufferSize, "energyBuffer");
energySum.initialize<cl_double>(*this, 1, "energySum"); energySum.initialize<cl_double>(*this, numComputeUnits, "energySum");
} }
else { else {
forceBuffers.initialize<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers.initialize<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize<mm_float4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force"); force.initialize<mm_float4>(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize<cl_float>(*this, energyBufferSize, "energyBuffer"); energyBuffer.initialize<cl_float>(*this, energyBufferSize, "energyBuffer");
energySum.initialize<cl_float>(*this, 1, "energySum"); energySum.initialize<cl_float>(*this, numComputeUnits, "energySum");
} }
reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer.getDeviceBuffer()); reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer.getDeviceBuffer());
reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers.getDeviceBuffer()); reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers.getDeviceBuffer());
...@@ -798,17 +799,18 @@ double OpenCLContext::reduceEnergy() { ...@@ -798,17 +799,18 @@ double OpenCLContext::reduceEnergy() {
reduceEnergyKernel.setArg<cl_int>(2, energyBuffer.getSize()); reduceEnergyKernel.setArg<cl_int>(2, energyBuffer.getSize());
reduceEnergyKernel.setArg<cl_int>(3, workGroupSize); reduceEnergyKernel.setArg<cl_int>(3, workGroupSize);
reduceEnergyKernel.setArg(4, workGroupSize*energyBuffer.getElementSize(), NULL); reduceEnergyKernel.setArg(4, workGroupSize*energyBuffer.getElementSize(), NULL);
executeKernel(reduceEnergyKernel, workGroupSize, workGroupSize); executeKernel(reduceEnergyKernel, workGroupSize*energySum.getSize(), workGroupSize);
energySum.download(pinnedMemory);
double result = 0;
if (getUseDoublePrecision() || getUseMixedPrecision()) { if (getUseDoublePrecision() || getUseMixedPrecision()) {
double energy; for (int i = 0; i < energySum.getSize(); i++)
energySum.download(&energy); result += ((double*) pinnedMemory)[i];
return energy;
} }
else { else {
float energy; for (int i = 0; i < energySum.getSize(); i++)
energySum.download(&energy); result += ((float*) pinnedMemory)[i];
return energy;
} }
return result;
} }
void OpenCLContext::setCharges(const vector<double>& charges) { void OpenCLContext::setCharges(const vector<double>& charges) {
......
...@@ -108,7 +108,7 @@ __kernel void reduceForces(__global long* restrict longBuffer, __global real4* r ...@@ -108,7 +108,7 @@ __kernel void reduceForces(__global long* restrict longBuffer, __global real4* r
__kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global mixed* restrict result, int bufferSize, int workGroupSize, __local mixed* tempBuffer) { __kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global mixed* restrict result, int bufferSize, int workGroupSize, __local mixed* tempBuffer) {
const unsigned int thread = get_local_id(0); const unsigned int thread = get_local_id(0);
mixed sum = 0; mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) for (unsigned int index = get_global_id(0); index < bufferSize; index += get_global_size(0))
sum += energyBuffer[index]; sum += energyBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) { for (int i = 1; i < workGroupSize; i *= 2) {
...@@ -117,7 +117,7 @@ __kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global ...@@ -117,7 +117,7 @@ __kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global
tempBuffer[thread] += tempBuffer[thread+i]; tempBuffer[thread] += tempBuffer[thread+i];
} }
if (thread == 0) if (thread == 0)
*result = tempBuffer[0]; result[get_group_id(0)] = tempBuffer[0];
} }
/** /**
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment