Commit 56b199be authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to RMSDForce

parent f4dc3110
......@@ -6887,7 +6887,7 @@ double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel.
int numParticles = particles->getSize();
int blockSize = 128;
int blockSize = 256;
void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
&particles->getDevicePointer(), &buffer->getDevicePointer()};
cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL));
......
......@@ -4,11 +4,16 @@
/**
* Sum a value over all threads.
*/
__device__ real reduceValue(real value, real* temp) {
__device__ real reduceValue(real value, volatile real* temp) {
const int thread = threadIdx.x;
temp[thread] = value;
__syncthreads();
for (uint step = 1; step < blockDim.x; step *= 2) {
for (uint step = 1; step < 32; step *= 2) {
if (thread+step < blockDim.x && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
SYNC_WARPS
}
for (uint step = 32; step < blockDim.x; step *= 2) {
if (thread+step < blockDim.x && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
__syncthreads();
......@@ -21,7 +26,7 @@ __device__ real reduceValue(real value, real* temp) {
*/
extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
const int* __restrict__ particles, real* buffer) {
extern __shared__ real temp[];
extern __shared__ volatile real temp[];
// Compute the center of the particle positions.
......
......@@ -7167,7 +7167,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel.
int numParticles = particles->getSize();
int blockSize = 128;
int blockSize = min(256, (int) kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
kernel1.setArg<cl_int>(0, numParticles);
kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());
......
......@@ -4,11 +4,16 @@
/**
* Sum a value over all threads.
*/
real reduceValue(real value, __local real* temp) {
real reduceValue(real value, __local volatile real* temp) {
const int thread = get_local_id(0);
temp[thread] = value;
barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < get_local_size(0); step *= 2) {
for (uint step = 1; step < 32; step *= 2) {
if (thread+step < get_local_size(0) && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
SYNC_WARPS
}
for (uint step = 32; step < get_local_size(0); step *= 2) {
if (thread+step < get_local_size(0) && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -20,7 +25,7 @@ real reduceValue(real value, __local real* temp) {
* Perform the first step of computing the RMSD. This is executed as a single work group.
*/
__kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
__global const int* restrict particles, __global real* buffer, __local real* restrict temp) {
__global const int* restrict particles, __global real* buffer, __local volatile real* restrict temp) {
// Compute the center of the particle positions.
real3 center = (real3) 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment