"wrappers/python/vscode:/vscode.git/clone" did not exist on "a5415e25f7ffec35a9c29da934c9e9ede240d8fa"
Unverified Commit 94057ffb authored by Dann239, committed by GitHub
Browse files

Made OpenCLContext::reduceBuffer additionally store the result into a 64-bit...

Made OpenCLContext::reduceBuffer additionally store the result into a 64-bit fixed point buffer. (#3209)
parent 6bdaa495
...@@ -364,11 +364,13 @@ public: ...@@ -364,11 +364,13 @@ public:
/** /**
* Given a collection of floating point buffers packed into an array, sum them and store * Given a collection of floating point buffers packed into an array, sum them and store
* the sum in the first buffer. * the sum in the first buffer.
* Also, write the result into a 64-bit fixed point buffer (overwriting its contents).
* *
* @param array the array containing the buffers to reduce * @param array the array containing the buffers to reduce
* @param longBuffer the 64-bit fixed point buffer to write the result into
* @param numBuffers the number of buffers packed into the array * @param numBuffers the number of buffers packed into the array
*/ */
void reduceBuffer(OpenCLArray& array, int numBuffers); void reduceBuffer(OpenCLArray& array, OpenCLArray& longBuffer, int numBuffers);
/** /**
* Sum the buffers containing forces. * Sum the buffers containing forces.
*/ */
......
...@@ -782,11 +782,12 @@ void OpenCLContext::reduceForces() { ...@@ -782,11 +782,12 @@ void OpenCLContext::reduceForces() {
executeKernel(reduceForcesKernel, paddedNumAtoms, 128); executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
} }
void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) { void OpenCLContext::reduceBuffer(OpenCLArray& array, OpenCLArray& longBuffer, int numBuffers) {
int bufferSize = array.getSize()/numBuffers; int bufferSize = array.getSize()/numBuffers;
reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer()); reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
reduceReal4Kernel.setArg<cl_int>(1, bufferSize); reduceReal4Kernel.setArg<cl::Buffer>(1, longBuffer.getDeviceBuffer());
reduceReal4Kernel.setArg<cl_int>(2, numBuffers); reduceReal4Kernel.setArg<cl_int>(2, bufferSize);
reduceReal4Kernel.setArg<cl_int>(3, numBuffers);
executeKernel(reduceReal4Kernel, bufferSize, 128); executeKernel(reduceReal4Kernel, bufferSize, 128);
} }
......
...@@ -179,7 +179,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -179,7 +179,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
cl.getQueue().enqueueWriteBuffer(contextForces.getDeviceBuffer(), CL_FALSE, numAtoms*elementSize, cl.getQueue().enqueueWriteBuffer(contextForces.getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory); numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
cl.reduceBuffer(contextForces, data.contexts.size()); cl.reduceBuffer(contextForces, cl.getLongForceBuffer(), data.contexts.size());
// Balance work between the contexts by transferring a little nonbonded work from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
......
...@@ -67,9 +67,10 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa ...@@ -67,9 +67,10 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa
/** /**
* Sum a collection of buffers into the first one. * Sum a collection of buffers into the first one.
* Also, write the result into a 64-bit fixed point buffer (overwriting its contents).
*/ */
__kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, int numBuffers) { __kernel void reduceReal4Buffer(__global real4* restrict buffer, __global long* restrict longBuffer, int bufferSize, int numBuffers) {
int index = get_global_id(0); int index = get_global_id(0);
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
while (index < bufferSize) { while (index < bufferSize) {
...@@ -77,6 +78,9 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, ...@@ -77,6 +78,9 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += buffer[i]; sum += buffer[i];
buffer[index] = sum; buffer[index] = sum;
longBuffer[index] = (long) (sum.x*0x100000000);
longBuffer[index+bufferSize] = (long) (sum.y*0x100000000);
longBuffer[index+2*bufferSize] = (long) (sum.z*0x100000000);
index += get_global_size(0); index += get_global_size(0);
} }
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment