Commit 711c3a5a authored by Peter Eastman's avatar Peter Eastman
Browse files

Sum energy on the GPU

parent d4fcab2a
...@@ -285,6 +285,10 @@ public: ...@@ -285,6 +285,10 @@ public:
* Clear all buffers that have been registered with addAutoclearBuffer(). * Clear all buffers that have been registered with addAutoclearBuffer().
*/ */
void clearAutoclearBuffers(); void clearAutoclearBuffers();
/**
* Sum the buffer containing energy.
*/
double reduceEnergy();
/** /**
* Get the current simulation time. * Get the current simulation time.
*/ */
...@@ -638,6 +642,7 @@ private: ...@@ -638,6 +642,7 @@ private:
CUfunction clearFourBuffersKernel; CUfunction clearFourBuffersKernel;
CUfunction clearFiveBuffersKernel; CUfunction clearFiveBuffersKernel;
CUfunction clearSixBuffersKernel; CUfunction clearSixBuffersKernel;
CUfunction reduceEnergyKernel;
CUfunction setChargesKernel; CUfunction setChargesKernel;
std::vector<CudaForceInfo*> forces; std::vector<CudaForceInfo*> forces;
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
...@@ -649,6 +654,7 @@ private: ...@@ -649,6 +654,7 @@ private:
CudaArray* velm; CudaArray* velm;
CudaArray* force; CudaArray* force;
CudaArray* energyBuffer; CudaArray* energyBuffer;
CudaArray* energySum;
CudaArray* energyParamDerivBuffer; CudaArray* energyParamDerivBuffer;
CudaArray* atomIndexDevice; CudaArray* atomIndexDevice;
CudaArray* chargeBuffer; CudaArray* chargeBuffer;
......
...@@ -108,7 +108,7 @@ static int executeInWindows(const string &command) { ...@@ -108,7 +108,7 @@ static int executeInWindows(const string &command) {
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0), const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL), pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
// Determine what compiler to use. // Determine what compiler to use.
...@@ -307,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -307,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers"); clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers"); clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers"); clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
setChargesKernel = getKernel(utilities, "setCharges"); setChargesKernel = getKernel(utilities, "setCharges");
// Set defines based on the requested precision. // Set defines based on the requested precision.
...@@ -420,6 +421,8 @@ CudaContext::~CudaContext() { ...@@ -420,6 +421,8 @@ CudaContext::~CudaContext() {
delete force; delete force;
if (energyBuffer != NULL) if (energyBuffer != NULL)
delete energyBuffer; delete energyBuffer;
if (energySum != NULL)
delete energySum;
if (energyParamDerivBuffer != NULL) if (energyParamDerivBuffer != NULL)
delete energyParamDerivBuffer; delete energyParamDerivBuffer;
if (atomIndexDevice != NULL) if (atomIndexDevice != NULL)
...@@ -450,16 +453,19 @@ void CudaContext::initialize() { ...@@ -450,16 +453,19 @@ void CudaContext::initialize() {
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else { else {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<float>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
} }
...@@ -879,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() { ...@@ -879,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() {
} }
} }
double CudaContext::reduceEnergy() {
int bufferSize = energyBuffer->getSize();
int workGroupSize = 512;
void* args[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer->getElementSize());
energySum->download(pinnedBuffer);
if (getUseDoublePrecision() || getUseMixedPrecision())
return *((double*) pinnedBuffer);
else
return *((float*) pinnedBuffer);
}
void CudaContext::setCharges(const vector<double>& charges) { void CudaContext::setCharges(const vector<double>& charges) {
if (chargeBuffer == NULL) if (chargeBuffer == NULL)
chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
......
...@@ -115,21 +115,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo ...@@ -115,21 +115,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
for (auto computation : cu.getPostComputations()) for (auto computation : cu.getPostComputations())
sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups); sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
cu.getIntegrationUtilities().distributeForcesFromVirtualSites(); cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy)
CudaArray& energyArray = cu.getEnergyBuffer(); sum += cu.reduceEnergy();
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double* energy = (double*) cu.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
sum += energy[i];
}
else {
float* energy = (float*) cu.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
sum += energy[i];
}
}
if (!cu.getForcesValid()) if (!cu.getForcesValid())
valid = false; valid = false;
return sum; return sum;
......
...@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res ...@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
clearSingleBuffer(buffer6, size6); clearSingleBuffer(buffer6, size6);
} }
/**
* Sum the energy buffer.
*/
__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
extern __shared__ mixed tempBuffer[];
const unsigned int thread = threadIdx.x;
mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += blockDim.x)
sum += energyBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) {
__syncthreads();
if (thread%(i*2) == 0 && thread+i < workGroupSize)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
/** /**
* Record the atomic charges into the posq array. * Record the atomic charges into the posq array.
*/ */
......
...@@ -364,9 +364,13 @@ public: ...@@ -364,9 +364,13 @@ public:
*/ */
void reduceBuffer(OpenCLArray& array, int numBuffers); void reduceBuffer(OpenCLArray& array, int numBuffers);
/** /**
* Sum the buffesr containing forces. * Sum the buffers containing forces.
*/ */
void reduceForces(); void reduceForces();
/**
* Sum the buffer containing energy.
*/
double reduceEnergy();
/** /**
* Get the current simulation time. * Get the current simulation time.
*/ */
...@@ -750,6 +754,7 @@ private: ...@@ -750,6 +754,7 @@ private:
cl::Kernel clearSixBuffersKernel; cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceReal4Kernel; cl::Kernel reduceReal4Kernel;
cl::Kernel reduceForcesKernel; cl::Kernel reduceForcesKernel;
cl::Kernel reduceEnergyKernel;
cl::Kernel setChargesKernel; cl::Kernel setChargesKernel;
std::vector<OpenCLForceInfo*> forces; std::vector<OpenCLForceInfo*> forces;
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
...@@ -764,6 +769,7 @@ private: ...@@ -764,6 +769,7 @@ private:
OpenCLArray* forceBuffers; OpenCLArray* forceBuffers;
OpenCLArray* longForceBuffer; OpenCLArray* longForceBuffer;
OpenCLArray* energyBuffer; OpenCLArray* energyBuffer;
OpenCLArray* energySum;
OpenCLArray* energyParamDerivBuffer; OpenCLArray* energyParamDerivBuffer;
OpenCLArray* atomIndexDevice; OpenCLArray* atomIndexDevice;
OpenCLArray* chargeBuffer; OpenCLArray* chargeBuffer;
......
...@@ -69,7 +69,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i ...@@ -69,7 +69,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) : OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL), system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL),
posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL),
chargeBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { chargeBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") { if (precision == "single") {
useDoublePrecision = false; useDoublePrecision = false;
...@@ -315,6 +315,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -315,6 +315,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer"); reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
reduceForcesKernel = cl::Kernel(utilities, "reduceForces"); reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
reduceEnergyKernel = cl::Kernel(utilities, "reduceEnergy");
setChargesKernel = cl::Kernel(utilities, "setCharges"); setChargesKernel = cl::Kernel(utilities, "setCharges");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use. // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
...@@ -442,6 +443,8 @@ OpenCLContext::~OpenCLContext() { ...@@ -442,6 +443,8 @@ OpenCLContext::~OpenCLContext() {
delete longForceBuffer; delete longForceBuffer;
if (energyBuffer != NULL) if (energyBuffer != NULL)
delete energyBuffer; delete energyBuffer;
if (energySum != NULL)
delete energySum;
if (energyParamDerivBuffer != NULL) if (energyParamDerivBuffer != NULL)
delete energyParamDerivBuffer; delete energyParamDerivBuffer;
if (atomIndexDevice != NULL) if (atomIndexDevice != NULL)
...@@ -471,11 +474,19 @@ void OpenCLContext::initialize() { ...@@ -471,11 +474,19 @@ void OpenCLContext::initialize() {
forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force"); force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer"); energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer");
energySum = OpenCLArray::create<cl_double>(*this, 1, "energySum");
} }
else { else if (useMixedPrecision) {
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force"); force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer"); energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer");
energySum = OpenCLArray::create<cl_double>(*this, 1, "energySum");
}
else {
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_float>(*this, energyBufferSize, "energyBuffer");
energySum = OpenCLArray::create<cl_float>(*this, 1, "energySum");
} }
if (supports64BitGlobalAtomics) { if (supports64BitGlobalAtomics) {
longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer"); longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
...@@ -756,6 +767,28 @@ void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) { ...@@ -756,6 +767,28 @@ void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
executeKernel(reduceReal4Kernel, bufferSize, 128); executeKernel(reduceReal4Kernel, bufferSize, 128);
} }
double OpenCLContext::reduceEnergy() {
int workGroupSize = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
if (workGroupSize > 512)
workGroupSize = 512;
reduceEnergyKernel.setArg<cl::Buffer>(0, energyBuffer->getDeviceBuffer());
reduceEnergyKernel.setArg<cl::Buffer>(1, energySum->getDeviceBuffer());
reduceEnergyKernel.setArg<cl_int>(2, energyBuffer->getSize());
reduceEnergyKernel.setArg<cl_int>(3, workGroupSize);
reduceEnergyKernel.setArg(4, workGroupSize*energyBuffer->getElementSize(), NULL);
executeKernel(reduceEnergyKernel, workGroupSize, workGroupSize);
if (getUseDoublePrecision() || getUseMixedPrecision()) {
double energy;
energySum->download(&energy);
return energy;
}
else {
float energy;
energySum->download(&energy);
return energy;
}
}
void OpenCLContext::setCharges(const vector<double>& charges) { void OpenCLContext::setCharges(const vector<double>& charges) {
if (chargeBuffer == NULL) if (chargeBuffer == NULL)
chargeBuffer = new OpenCLArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); chargeBuffer = new OpenCLArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
......
...@@ -139,21 +139,8 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, ...@@ -139,21 +139,8 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups); sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
cl.reduceForces(); cl.reduceForces();
cl.getIntegrationUtilities().distributeForcesFromVirtualSites(); cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy)
OpenCLArray& energyArray = cl.getEnergyBuffer(); sum += cl.reduceEnergy();
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double* energy = (double*) cl.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
sum += energy[i];
}
else {
float* energy = (float*) cl.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
sum += energy[i];
}
}
if (!cl.getForcesValid()) if (!cl.getForcesValid())
valid = false; valid = false;
return sum; return sum;
......
...@@ -97,6 +97,24 @@ __kernel void reduceForces(__global const long* restrict longBuffer, __global re ...@@ -97,6 +97,24 @@ __kernel void reduceForces(__global const long* restrict longBuffer, __global re
} }
#endif #endif
/**
* Sum the energy buffer.
*/
__kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global mixed* restrict result, int bufferSize, int workGroupSize, __local mixed* tempBuffer) {
const unsigned int thread = get_local_id(0);
mixed sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += energyBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < workGroupSize)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
/** /**
* This is called to determine the accuracy of various native functions. * This is called to determine the accuracy of various native functions.
*/ */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment