Commit 56b199be authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to RMSDForce

parent f4dc3110
...@@ -6887,7 +6887,7 @@ double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) { ...@@ -6887,7 +6887,7 @@ double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel. // Execute the first kernel.
int numParticles = particles->getSize(); int numParticles = particles->getSize();
int blockSize = 128; int blockSize = 256;
void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(), void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
&particles->getDevicePointer(), &buffer->getDevicePointer()}; &particles->getDevicePointer(), &buffer->getDevicePointer()};
cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL)); cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL));
......
...@@ -4,11 +4,16 @@ ...@@ -4,11 +4,16 @@
/** /**
* Sum a value over all threads. * Sum a value over all threads.
*/ */
__device__ real reduceValue(real value, real* temp) { __device__ real reduceValue(real value, volatile real* temp) {
const int thread = threadIdx.x; const int thread = threadIdx.x;
temp[thread] = value; temp[thread] = value;
__syncthreads(); __syncthreads();
for (uint step = 1; step < blockDim.x; step *= 2) { for (uint step = 1; step < 32; step *= 2) {
if (thread+step < blockDim.x && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
SYNC_WARPS
}
for (uint step = 32; step < blockDim.x; step *= 2) {
if (thread+step < blockDim.x && thread%(2*step) == 0) if (thread+step < blockDim.x && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step]; temp[thread] = temp[thread] + temp[thread+step];
__syncthreads(); __syncthreads();
...@@ -21,7 +26,7 @@ __device__ real reduceValue(real value, real* temp) { ...@@ -21,7 +26,7 @@ __device__ real reduceValue(real value, real* temp) {
*/ */
extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos, extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
const int* __restrict__ particles, real* buffer) { const int* __restrict__ particles, real* buffer) {
extern __shared__ real temp[]; extern __shared__ volatile real temp[];
// Compute the center of the particle positions. // Compute the center of the particle positions.
......
...@@ -7167,7 +7167,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) { ...@@ -7167,7 +7167,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel. // Execute the first kernel.
int numParticles = particles->getSize(); int numParticles = particles->getSize();
int blockSize = 128; int blockSize = min(256, (int) kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
kernel1.setArg<cl_int>(0, numParticles); kernel1.setArg<cl_int>(0, numParticles);
kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer()); kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());
......
...@@ -4,11 +4,16 @@ ...@@ -4,11 +4,16 @@
/** /**
* Sum a value over all threads. * Sum a value over all threads.
*/ */
real reduceValue(real value, __local real* temp) { real reduceValue(real value, __local volatile real* temp) {
const int thread = get_local_id(0); const int thread = get_local_id(0);
temp[thread] = value; temp[thread] = value;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < get_local_size(0); step *= 2) { for (uint step = 1; step < 32; step *= 2) {
if (thread+step < get_local_size(0) && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
SYNC_WARPS
}
for (uint step = 32; step < get_local_size(0); step *= 2) {
if (thread+step < get_local_size(0) && thread%(2*step) == 0) if (thread+step < get_local_size(0) && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step]; temp[thread] = temp[thread] + temp[thread+step];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -20,7 +25,7 @@ real reduceValue(real value, __local real* temp) { ...@@ -20,7 +25,7 @@ real reduceValue(real value, __local real* temp) {
* Perform the first step of computing the RMSD. This is executed as a single work group. * Perform the first step of computing the RMSD. This is executed as a single work group.
*/ */
__kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos, __kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
__global const int* restrict particles, __global real* buffer, __local real* restrict temp) { __global const int* restrict particles, __global real* buffer, __local volatile real* restrict temp) {
// Compute the center of the particle positions. // Compute the center of the particle positions.
real3 center = (real3) 0; real3 center = (real3) 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment