Commit 59bccb15 authored by peastman's avatar peastman
Browse files

Merge branch 'master' into pthreads

Conflicts:
	libraries/pthreads/include/pthread.h
parents 6cf75568 4bdbcf4d
...@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one // if they supplied a valid deviceIndex, we only look through that one
if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size()) if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
continue; continue;
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")) continue; // The CPU device on OS X won't work correctly.
continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]; int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
int processingElementsPerComputeUnit = 8; int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) { if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
...@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel") if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = ""; defaultOptimizationOptions = "";
else if (platformVendor == "Apple")
defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
else else
defaultOptimizationOptions = "-cl-fast-relaxed-math"; defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
...@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
} }
else else
simdWidth = 1; simdWidth = 1;
if (platformVendor == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = ""; compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision) if (supportsDoublePrecision)
......
...@@ -455,6 +455,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -455,6 +455,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
case Operation::CEIL: case Operation::CEIL:
out << "ceil(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ceil(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::SELECT:
out << "(" << getTempName(node.getChildren()[0], temps) << " != 0 ? " << getTempName(node.getChildren()[1], temps) << " : " << getTempName(node.getChildren()[2], temps) << ")";
break;
default: default:
throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName()); throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
} }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -35,25 +35,109 @@ ...@@ -35,25 +35,109 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) { OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
zkernel = createKernel(xsize, ysize, zsize, zthreads); context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
xkernel = createKernel(ysize, zsize, xsize, xthreads); packRealAsComplex = false;
ykernel = createKernel(zsize, xsize, ysize, ythreads); int packedXSize = xsize;
int packedYSize = ysize;
int packedZSize = zsize;
if (realToComplex) {
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex = true;
int packedAxis, bufferSize;
if (xsize%2 == 0) {
packedAxis = 0;
packedXSize /= 2;
bufferSize = packedXSize;
}
else if (ysize%2 == 0) {
packedAxis = 1;
packedYSize /= 2;
bufferSize = packedYSize;
}
else if (zsize%2 == 0) {
packedAxis = 2;
packedZSize /= 2;
bufferSize = packedZSize;
}
else
packRealAsComplex = false;
if (packRealAsComplex) {
// Build the kernels for packing and unpacking the data.
map<string, string> defines;
defines["XSIZE"] = context.intToString(xsize);
defines["YSIZE"] = context.intToString(ysize);
defines["ZSIZE"] = context.intToString(zsize);
defines["PACKED_AXIS"] = context.intToString(packedAxis);
defines["PACKED_XSIZE"] = context.intToString(packedXSize);
defines["PACKED_YSIZE"] = context.intToString(packedYSize);
defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
defines["M_PI"] = context.doubleToString(M_PI);
cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
packForwardKernel = cl::Kernel(program, "packForwardData");
unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
packBackwardKernel = cl::Kernel(program, "packBackwardData");
packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
}
}
bool inputIsReal = (realToComplex && !packRealAsComplex);
zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
} }
void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) { void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer()); cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer()); cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
zkernel.setArg<cl_int>(2, forward ? 1 : -1); cl::Kernel kernel3 = (forward ? ykernel : invykernel);
context.executeKernel(zkernel, xsize*ysize*zsize, zthreads); if (packRealAsComplex) {
xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer()); cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer()); cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
xkernel.setArg<cl_int>(2, forward ? 1 : -1); int gridSize = xsize*ysize*zsize/2;
context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer()); // Pack the data into a half sized grid.
ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
ykernel.setArg<cl_int>(2, forward ? 1 : -1); packKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
context.executeKernel(ykernel, xsize*ysize*zsize, ythreads); packKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(packKernel, gridSize);
// Perform the FFT.
kernel1.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel1, gridSize, zthreads);
kernel2.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel2, gridSize, xthreads);
kernel3.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel3.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel3, gridSize, ythreads);
// Unpack the data.
unpackKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
unpackKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(unpackKernel, gridSize);
}
else {
kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
}
} }
int OpenCLFFT3D::findLegalDimension(int minimum) { int OpenCLFFT3D::findLegalDimension(int minimum) {
...@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) { ...@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
} }
} }
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) { cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
while (maxThreads > 128 && maxThreads-64 >= zsize)
maxThreads -= 64;
bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU; bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
while (true) { while (true) {
bool loopRequired = (zsize > maxThreads || isCPU); bool loopRequired = (zsize > maxThreads || isCPU);
...@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n"; source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n"; source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n"; source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n"; source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n"; source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n"; source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n"; source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
source<<"real2 t0 = b0+b1;\n"; source<<"real2 t0 = b0+b1;\n";
source<<"real2 t1 = b2+b3;\n"; source<<"real2 t1 = b2+b3;\n";
source<<"real2 t2 = b4-b3;\n"; source<<"real2 t2 = b4-b3;\n";
...@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 d7 = d6+d5;\n"; source<<"real2 d7 = d6+d5;\n";
source<<"real2 d8 = d6-d5;\n"; source<<"real2 d8 = d6-d5;\n";
string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI)); string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n"; source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n"; source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n"; source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n"; source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n"; source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
...@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 d0 = c0+c2;\n"; source<<"real2 d0 = c0+c2;\n";
source<<"real2 d1 = c0-c2;\n"; source<<"real2 d1 = c0-c2;\n";
source<<"real2 d2 = c1+c3;\n"; source<<"real2 d2 = c1+c3;\n";
source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n"; source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n"; source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n"; source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n"; source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
...@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n"; source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 d0 = c1+c2;\n"; source<<"real2 d0 = c1+c2;\n";
source<<"real2 d1 = c0-0.5f*d0;\n"; source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n"; source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n"; source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n"; source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n"; source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
...@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel. // Create the kernel.
bool outputIsReal = (inputIsReal && axis == 2 && !forward);
bool outputIsPacked = (inputIsReal && axis == 2 && forward);
string outputSuffix = (outputIsReal ? ".x" : "");
if (loopRequired) { if (loopRequired) {
if (outputIsPacked)
source<<"if (x < XSIZE/2+1)\n";
source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n"; source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n"; if (outputIsPacked)
source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
else
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
} }
else { else {
source<<"if (index < XSIZE*YSIZE)\n"; if (outputIsPacked) {
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n"; source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
}
else {
source<<"if (index < XSIZE*YSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
}
} }
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize); replacements["XSIZE"] = context.intToString(xsize);
...@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements["M_PI"] = context.doubleToString(M_PI); replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str(); replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0"); replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
replacements["SIGN"] = (forward ? "1" : "-1");
replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT"); cl::Kernel kernel(program, "execFFT");
threads = (isCPU ? 1 : blocksPerGroup*zsize); threads = (isCPU ? 1 : blocksPerGroup*zsize);
...@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue; continue;
} }
int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)); int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(2, bufferSize, NULL);
kernel.setArg(3, bufferSize, NULL); kernel.setArg(3, bufferSize, NULL);
kernel.setArg(4, bufferSize, NULL); kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL);
return kernel; return kernel;
} }
} }
...@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY); pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ); pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0)); pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cl.doubleToString(M_PI);
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) if (deviceIsCpu)
pmeDefines["DEVICE_IS_CPU"] = "1"; pmeDefines["DEVICE_IS_CPU"] = "1";
...@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid"); pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
cl.addAutoclearBuffer(*pmeGrid);
pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2"); pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(*pmeGrid2);
else
cl.addAutoclearBuffer(*pmeGrid);
pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms()); sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA"); bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = isNvidia;
if (usePmeQueue) { if (usePmeQueue) {
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice()); pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
...@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeZIndexKernel = cl::Kernel(program, "recordZIndex"); pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge"); pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution"); pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce"); pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
...@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer()); if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2->getDeviceBuffer());
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
} }
} }
} }
...@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms()); cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
} }
if (pmeGrid != NULL && includeReciprocal) { if (pmeGrid != NULL && includeReciprocal) {
if (usePmeQueue) if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue); cl.setQueue(pmeQueue);
// Invert the periodic box vectors. // Invert the periodic box vectors.
...@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
} }
fft->execFFT(*pmeGrid, *pmeGrid2, true); fft->execFFT(*pmeGrid, *pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]); pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]); pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]); pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor); pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
} }
else { else {
pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]); pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]); pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]); pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor); pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
} pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms()); cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
fft->execFFT(*pmeGrid2, *pmeGrid, false); fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3); setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);
...@@ -5972,7 +5993,7 @@ string OpenCLIntegrateCustomStepKernel::createGlobalComputation(const string& va ...@@ -5972,7 +5993,7 @@ string OpenCLIntegrateCustomStepKernel::createGlobalComputation(const string& va
variables["dt"] = "dt[0].y"; variables["dt"] = "dt[0].y";
variables["uniform"] = "uniform"; variables["uniform"] = "uniform";
variables["gaussian"] = "gaussian"; variables["gaussian"] = "gaussian";
variables[energyName] = "energy[0]"; variables[energyName] = "energy";
for (int i = 0; i < integrator.getNumGlobalVariables(); i++) for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(i)+"]"; variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(i)+"]";
for (int i = 0; i < (int) parameterNames.size(); i++) for (int i = 0; i < (int) parameterNames.size(); i++)
...@@ -6008,7 +6029,7 @@ string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& va ...@@ -6008,7 +6029,7 @@ string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& va
variables["m"] = "mass"; variables["m"] = "mass";
variables["dt"] = "stepSize"; variables["dt"] = "stepSize";
if (energyName != "") if (energyName != "")
variables[energyName] = "energy[0]"; variables[energyName] = "energy";
for (int i = 0; i < integrator.getNumGlobalVariables(); i++) for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(i)+"]"; variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(i)+"]";
for (int i = 0; i < integrator.getNumPerDofVariables(); i++) for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
...@@ -6238,8 +6259,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -6238,8 +6259,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
index += 3; index += 4;
kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
kernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory()); kernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
if (stepType[step] == CustomIntegrator::ComputeSum) { if (stepType[step] == CustomIntegrator::ComputeSum) {
...@@ -6284,8 +6304,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -6284,8 +6304,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kernel.setArg<cl::Buffer>(index++, integration.getStepSize().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, integration.getStepSize().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer());
index += 2;
kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
} }
else if (stepType[step] == CustomIntegrator::ConstrainPositions) { else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
// Apply position constraints. // Apply position constraints.
...@@ -6326,16 +6344,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -6326,16 +6344,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
randomKernel.setArg<cl::Buffer>(1, uniformRandoms->getDeviceBuffer()); randomKernel.setArg<cl::Buffer>(1, uniformRandoms->getDeviceBuffer());
randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer()); randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer());
// Create the kernel for summing the potential energy.
cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
sumPotentialEnergyKernel = cl::Kernel(program, cl.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
int index = 0;
sumPotentialEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
sumPotentialEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
sumPotentialEnergyKernel.setArg<cl_int>(index++, 0);
sumPotentialEnergyKernel.setArg<cl_int>(index++, cl.getEnergyBuffer().getSize());
// Create the kernel for computing kinetic energy. // Create the kernel for computing kinetic energy.
stringstream computeKE; stringstream computeKE;
...@@ -6359,9 +6367,9 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -6359,9 +6367,9 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
if (defines.find("LOAD_POS_AS_DELTA") != defines.end()) if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
defines.erase("LOAD_POS_AS_DELTA"); defines.erase("LOAD_POS_AS_DELTA");
program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customIntegratorPerDof, replacements), defines); cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customIntegratorPerDof, replacements), defines);
kineticEnergyKernel = cl::Kernel(program, "computePerDof"); kineticEnergyKernel = cl::Kernel(program, "computePerDof");
index = 0; int index = 0;
kineticEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); kineticEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
setPosqCorrectionArg(cl, kineticEnergyKernel, index++); setPosqCorrectionArg(cl, kineticEnergyKernel, index++);
kineticEnergyKernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer()); kineticEnergyKernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
...@@ -6373,7 +6381,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -6373,7 +6381,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer()); kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
index += 2; index += 2;
kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer()); kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer()); if (cl.getUseDoublePrecision())
kineticEnergyKernel.setArg<cl_double>(index++, 0.0);
else
kineticEnergyKernel.setArg<cl_float>(index++, 0.0f);
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
kineticEnergyKernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory()); kineticEnergyKernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
keNeedsForce = usesVariable(keExpression, "f"); keNeedsForce = usesVariable(keExpression, "f");
...@@ -6480,9 +6491,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -6480,9 +6491,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
} }
else { else {
recordChangedParameters(context); recordChangedParameters(context);
context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]); energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
if (computeEnergy)
cl.executeKernel(sumPotentialEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
forcesAreValid = true; forcesAreValid = true;
} }
} }
...@@ -6490,6 +6499,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -6490,6 +6499,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i])); kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer()); kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer());
kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer()); kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer());
if (cl.getUseDoublePrecision())
kernels[i][0].setArg<cl_double>(12, energy);
else
kernels[i][0].setArg<cl_float>(12, (cl_float) energy);
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[i][0], numAtoms); cl.executeKernel(kernels[i][0], numAtoms);
...@@ -6497,12 +6510,20 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -6497,12 +6510,20 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
else if (stepType[i] == CustomIntegrator::ComputeGlobal && !merged[i]) { else if (stepType[i] == CustomIntegrator::ComputeGlobal && !merged[i]) {
kernels[i][0].setArg<cl_float>(3, (cl_float) SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber()); kernels[i][0].setArg<cl_float>(3, (cl_float) SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
kernels[i][0].setArg<cl_float>(4, (cl_float) SimTKOpenMMUtilities::getNormallyDistributedRandomNumber()); kernels[i][0].setArg<cl_float>(4, (cl_float) SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
if (cl.getUseDoublePrecision())
kernels[i][0].setArg<cl_double>(5, energy);
else
kernels[i][0].setArg<cl_float>(5, (cl_float) energy);
cl.executeKernel(kernels[i][0], 1, 1); cl.executeKernel(kernels[i][0], 1, 1);
} }
else if (stepType[i] == CustomIntegrator::ComputeSum) { else if (stepType[i] == CustomIntegrator::ComputeSum) {
kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i])); kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer()); kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer());
kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer()); kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer());
if (cl.getUseDoublePrecision())
kernels[i][0].setArg<cl_double>(12, energy);
else
kernels[i][0].setArg<cl_float>(12, (cl_float) energy);
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.clearBuffer(*sumBuffer); cl.clearBuffer(*sumBuffer);
...@@ -6552,9 +6573,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex ...@@ -6552,9 +6573,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
bool willNeedEnergy = false; bool willNeedEnergy = false;
for (int i = 0; i < integrator.getNumComputations(); i++) for (int i = 0; i < integrator.getNumComputations(); i++)
willNeedEnergy |= needsEnergy[i]; willNeedEnergy |= needsEnergy[i];
context.calcForcesAndEnergy(true, willNeedEnergy, -1); energy = context.calcForcesAndEnergy(true, willNeedEnergy, -1);
if (willNeedEnergy)
cl.executeKernel(sumPotentialEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
forcesAreValid = true; forcesAreValid = true;
} }
cl.clearBuffer(*sumBuffer); cl.clearBuffer(*sumBuffer);
......
...@@ -303,6 +303,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -303,6 +303,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff); defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize()); defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["SIMD_WIDTH"] = context.intToString(context.getSIMDWidth());
if (usePeriodic) if (usePeriodic)
defines["USE_PERIODIC"] = "1"; defines["USE_PERIODIC"] = "1";
int maxExclusions = 0; int maxExclusions = 0;
......
...@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const { ...@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool OpenCLPlatform::isPlatformSupported() { bool OpenCLPlatform::isPlatformSupported() {
// Return false for OpenCL implementations that are known // Return false for OpenCL implementations that are known
// to be buggy (Apple OSX since 10.7.5) // to be buggy (Apple OS X prior to 10.10).
#ifdef __APPLE__ #ifdef __APPLE__
char str[256]; char str[256];
...@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() { ...@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3) if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
return false; return false;
if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2)) if (major < 14 || (major == 14 && minor < 3))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// point at which a number of serious bugs were introduced into the // contained a number of serious bugs in the Apple OpenCL libraries.
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations // (See https://github.com/SimTk/openmm/issues/395 for example.)
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
return false; return false;
#endif #endif
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2013 Stanford University and the Authors. * * Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MIN_KEY"] = trait->getMinKey(); replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList"); shortListKernel = cl::Kernel(program, "sortShortList");
computeRangeKernel = cl::Kernel(program, "computeRange"); computeRangeKernel = cl::Kernel(program, "computeRange");
...@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())); unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())); unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()); unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
isShortList = (length <= maxLocalBuffer && length < maxShortListSize); // On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize); positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
......
__kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params, __kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
float uniform, float gaussian, __global const real* restrict energy) { float uniform, float gaussian, const real energy) {
COMPUTE_STEP COMPUTE_STEP
} }
...@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec ...@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta, __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals, __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues, __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, __global const real* restrict energy unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, const real energy
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = get_global_id(0); int index = get_global_id(0);
......
...@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) { ...@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) {
return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x); return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
} }
/**
* Load a value from the half-complex grid produces by a real-to-complex transform.
*/
real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
const int inputZSize = ZSIZE/2+1;
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
return (real2) (value.x, -value.y);
}
/** /**
* Perform a 1D FFT on each row along one axis. * Perform a 1D FFT on each row along one axis.
*/ */
__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w, __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TYPE* restrict out, __local real2* restrict w,
__local real2* restrict data0, __local real2* restrict data1) { __local real2* restrict data0, __local real2* restrict data1) {
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0)) for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE)); w[i] = (real2) (cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) { for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
int index = baseIndex+get_local_id(0)/ZSIZE; int index = baseIndex+get_local_id(0)/ZSIZE;
int x = index/YSIZE; int x = index/YSIZE;
int y = index-x*YSIZE; int y = index-x*YSIZE;
#if OUTPUT_IS_PACKED
if (x < XSIZE/2+1) {
#endif
#if LOOP_REQUIRED #if LOOP_REQUIRED
for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0)) for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
#if INPUT_IS_REAL
data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
#elif INPUT_IS_PACKED
data0[z] = loadComplexValue(in, x, y, z);
#else
data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z]; data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
#endif
#else #else
if (index < XSIZE*YSIZE) if (index < XSIZE*YSIZE)
#if INPUT_IS_REAL
data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
#elif INPUT_IS_PACKED
data0[get_local_id(0)] = loadComplexValue(in, x, y, get_local_id(0)%ZSIZE);
#else
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE]; data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
#endif
#endif
#if OUTPUT_IS_PACKED
}
#endif #endif
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT COMPUTE_FFT
......
/**
* Combine the two halves of a real grid into a complex grid that is half as large.
*/
__kernel void packForwardData(__global const real* restrict in, __global real2* restrict out) {
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
#if PACKED_AXIS == 0
real2 value = (real2) (in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
#elif PACKED_AXIS == 1
real2 value = (real2) (in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
#else
real2 value = (real2) (in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
#endif
out[index] = value;
}
}
/**
* Split the transformed data back into a full sized, symmetric grid.
*/
__kernel void unpackForwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
// Compute the phase factors.
#if PACKED_AXIS == 0
for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
w[i] = (real2) (sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
#elif PACKED_AXIS == 1
for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
w[i] = (real2) (sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
#else
for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
w[i] = (real2) (sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
#endif
barrier(CLK_LOCAL_MEM_FENCE);
// Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
const int outputZSize = ZSIZE/2+1;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
#if PACKED_AXIS == 0
real2 wfac = w[x];
#elif PACKED_AXIS == 1
real2 wfac = w[y];
#else
real2 wfac = w[z];
#endif
real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
if (z < outputZSize)
out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
xp = (x == 0 ? 0 : XSIZE-x);
yp = (y == 0 ? 0 : YSIZE-y);
zp = (z == 0 ? 0 : ZSIZE-z);
if (zp < outputZSize) {
#if PACKED_AXIS == 0
if (x == 0)
out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#elif PACKED_AXIS == 1
if (y == 0)
out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#else
if (z == 0)
out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#endif
else
out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) (output.x, -output.y);
}
}
}
/**
* Load a value from the half-complex grid produced by a real-to-complex transform.
*/
real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
const int inputZSize = ZSIZE/2+1;
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
return (real2) (value.x, -value.y);
}
/**
* Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
*/
__kernel void packBackwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
// Compute the phase factors.
#if PACKED_AXIS == 0
for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
w[i] = (real2) (cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
#elif PACKED_AXIS == 1
for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
w[i] = (real2) (cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
#else
for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
w[i] = (real2) (cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
#endif
barrier(CLK_LOCAL_MEM_FENCE);
// Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = loadComplexValue(in, x, y, z);
#if PACKED_AXIS == 0
real2 wfac = w[x];
real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
#elif PACKED_AXIS == 1
real2 wfac = w[y];
real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
#else
real2 wfac = w[z];
real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
#endif
real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2);
real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2);
odd = (real2) (odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = (real2) (even.x-odd.y, even.y+odd.x);
}
}
/**
* Split the data back into a full sized, real grid after an inverse transform.
*/
__kernel void unpackBackwardData(__global const real2* restrict in, __global real* restrict out) {
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
real2 value = 2*in[index];
#if PACKED_AXIS == 0
out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
#elif PACKED_AXIS == 1
out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
#else
out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
#endif
}
}
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#define BUFFER_SIZE 256
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
...@@ -65,6 +64,10 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -65,6 +64,10 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
} }
} }
#if SIMD_WIDTH <= 32
#define BUFFER_SIZE 256
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
...@@ -133,8 +136,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -133,8 +136,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
int block2 = block2Base+indexInWarp; int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS); bool includeBlock2 = (block2 < NUM_BLOCKS);
if (includeBlock2) { if (includeBlock2) {
real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : (real4) (0)); real4 blockCenterY = sortedBlockCenter[block2];
real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : (real4) (0)); real4 blockSizeY = sortedBlockBoundingBox[block2];
real4 blockDelta = blockCenterX-blockCenterY; real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta) APPLY_PERIODIC_TO_DELTA(blockDelta)
...@@ -251,4 +254,280 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -251,4 +254,280 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i] = posq[i]; oldPositions[i] = posq[i];
} }
\ No newline at end of file
#else
// This is the old implementation of finding interacting blocks. It is quite a bit more complicated,
// and slower on most GPUs. On AMD, however, it is faster, so we keep it around to use there.
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
void prefixSum(__local short* sum, __local ushort2* temp) {
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x = sum[i];
barrier(CLK_LOCAL_MEM_FENCE);
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (whichBuffer == 0)
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
sum[i] = temp[i].x;
else
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
sum[i] = temp[i].y;
barrier(CLK_LOCAL_MEM_FENCE);
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
void storeInteractionData(int x, __local unsigned short* buffer, __local short* sum, __local ushort2* temp, __local int* atoms, __local int* numAtoms,
__local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, __local real4* posBuffer,
real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (get_local_id(0) < TILE_SIZE) {
real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
}
#endif
posBuffer[get_local_id(0)] = pos;
}
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
sum[i] = (buffer[i] == INVALID ? 0 : 1);
barrier(CLK_LOCAL_MEM_FENCE);
prefixSum(sum, temp);
int numValid = sum[BUFFER_SIZE-1];
// Compact the buffer.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
if (buffer[i] != INVALID)
temp[sum[i]-1].x = buffer[i];
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
buffer[i] = temp[i].x;
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over the tiles and find specific interactions in them.
const int indexInWarp = get_local_id(0)%WARP_SIZE;
for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
// Check each atom in block Y for interactions.
real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
#ifdef USE_PERIODIC
if (singlePeriodicCopy)
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
#endif
bool interacts = false;
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
APPLY_PERIODIC_TO_DELTA(delta)
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
#ifdef USE_PERIODIC
}
#endif
sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
}
for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp] = 0;
// Compact the list of atoms.
barrier(CLK_LOCAL_MEM_FENCE);
prefixSum(sum, temp);
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
if (sum[i] != (i == 0 ? 0 : sum[i-1]))
atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
// Store them to global memory.
int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) {
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, tilesToStore);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0)
*numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
if (*baseIndex+tilesToStore <= maxTiles) {
if (get_local_id(0) < tilesToStore)
interactingTiles[*baseIndex+get_local_id(0)] = x;
for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
}
}
else {
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0)
*numAtoms += sum[BUFFER_SIZE-1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < *numAtoms && !storePartialTile)
atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
}
if (numValid == 0 && *numAtoms > 0 && finish) {
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, 1);
barrier(CLK_LOCAL_MEM_FENCE);
if (*baseIndex < maxTiles) {
if (get_local_id(0) == 0)
interactingTiles[*baseIndex] = x;
if (get_local_id(0) < TILE_SIZE)
interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
}
}
// Reset the buffer for processing more tiles.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
buffer[i] = INVALID;
barrier(CLK_LOCAL_MEM_FENCE);
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
__local unsigned short buffer[BUFFER_SIZE];
__local short sum[BUFFER_SIZE];
__local ushort2 temp[BUFFER_SIZE];
__local int atoms[BUFFER_SIZE+TILE_SIZE];
__local real4 posBuffer[TILE_SIZE];
__local int exclusionsForX[MAX_EXCLUSIONS];
__local int bufferFull;
__local int globalIndex;
__local int numAtoms;
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
// The IF actually causes the write to never be executed, its presence is all that is needed.
// AMD APP SDK 2.4 has this problem.
if (get_global_id(0) == get_local_id(0)+1)
((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
int valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over blocks sorted by size.
for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
if (get_local_id(0) == get_local_size(0)-1)
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i];
// Load exclusion data for block x.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
barrier(CLK_LOCAL_MEM_FENCE);
// Compare it to other blocks after this one in sorted order.
for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
int j = base+get_local_id(0);
real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
unsigned short y = (unsigned short) sortedKey2.y;
real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
// Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
buffer[bufferIndex] = y;
valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (bufferFull) {
storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
}
// Record the positions the neighbor list is based on.
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i] = posq[i];
}
#endif
...@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real add = pos.w*data[ix].x*data[iy].y*data[iz].z; real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
#ifdef USE_DOUBLE_PRECISION #ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
atom_add(&pmeGrid[2*index], (long) (add*0x100000000)); // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance. Why?
// I have no idea! And of course on AMD it produces slower performance. GPUs are not meant to be understood.
atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
#else #else
atom_add(&pmeGrid[index], (long) (add*0x100000000)); atom_add(&pmeGrid[index], (long) (add*0x100000000));
#endif #endif
} }
} }
} }
} }
} }
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) { __kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global real* restrict realGrid) {
__global real2* realGrid = (__global real2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0x100000000; real scale = EPSILON_FACTOR/(real) 0x100000000;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
#ifdef USE_DOUBLE_PRECISION #ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
long value = pmeGrid[2*index]; long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2];
#else #else
long value = pmeGrid[index]; long value = fixedGrid[index];
#endif #endif
real2 realValue = (real2) ((real) (value*scale), 0); realGrid[index] = (real) (value*scale);
realGrid[index] = realValue;
} }
} }
#elif defined(DEVICE_IS_CPU) #elif defined(DEVICE_IS_CPU)
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) { __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0); const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0); const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
if (firstx == lastx) if (firstx == lastx)
...@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z; pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
} }
} }
} }
...@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
} }
#else #else
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) { __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) { for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point. // Compute the charge on a grid point.
...@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
} }
} }
} }
pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0); pmeGrid[gridIndex] = result*EPSILON_FACTOR;
} }
} }
#endif #endif
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX, __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, real recipScaleFactor) { __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; // R2C stores into a half complex matrix where the last dimension is cut by half
real energy = 0.0f; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z); // real indices
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z; int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int ky = remainder/GRID_SIZE_Z; int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int kz = remainder-ky*GRID_SIZE_Z; int ky = remainder/(GRID_SIZE_Z/2+1);
if (kx == 0 && ky == 0 && kz == 0) int kz = remainder-ky*(GRID_SIZE_Z/2+1);
continue;
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
...@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r ...@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
real m2 = mhx*mhx+mhy*mhy+mhz*mhz; real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz; real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm); if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y); pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
}
}
}
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
} }
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] += 0.5f*energy;
} }
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid, __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) { real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
const real4 scale = 1/(real) (PME_ORDER-1); const real4 scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER]; real4 data[PME_ORDER];
...@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global ...@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real gridvalue = pmeGrid[index].x; real gridvalue = pmeGrid[index];
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue; force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue; force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue; force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
......
...@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u ...@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
float maxValue = (float) (range[1]); float maxValue = (float) (range[1]);
float bucketWidth = (maxValue-minValue)/numBuckets; float bucketWidth = (maxValue-minValue)/numBuckets;
for (uint index = get_global_id(0); index < length; index += get_global_size(0)) { for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
#if defined(MAC_AMD_WORKAROUND) && VALUE_IS_INT2
__global int* d = (__global int*) data;
int2 element = (int2) (d[2*index], d[2*index+1]);
float key = (float) getValue(element);
#else
float key = (float) getValue(data[index]); float key = (float) getValue(data[index]);
#endif
uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1); uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]); offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex; bucketOfElement[index] = bucketIndex;
......
...@@ -334,7 +334,7 @@ void testMonteCarlo() { ...@@ -334,7 +334,7 @@ void testMonteCarlo() {
integrator.addComputePerDof("oldx", "x"); integrator.addComputePerDof("oldx", "x");
integrator.addComputePerDof("x", "x+dt*gaussian"); integrator.addComputePerDof("x", "x+dt*gaussian");
integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)"); integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)");
integrator.addComputePerDof("x", "accept*x + (1-accept)*oldx"); integrator.addComputePerDof("x", "select(accept, x, oldx)");
HarmonicBondForce* forceField = new HarmonicBondForce(); HarmonicBondForce* forceField = new HarmonicBondForce();
forceField->addBond(0, 1, 2.0, 10.0); forceField->addBond(0, 1, 2.0, 10.0);
system.addForce(forceField); system.addForce(forceField);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011 Stanford University and the Authors. * * Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -51,7 +51,7 @@ using namespace std; ...@@ -51,7 +51,7 @@ using namespace std;
static OpenCLPlatform platform; static OpenCLPlatform platform;
template <class Real2> template <class Real2>
void testTransform() { void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
System system; System system;
system.addParticle(0.0); system.addParticle(0.0);
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false"); OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
...@@ -59,7 +59,6 @@ void testTransform() { ...@@ -59,7 +59,6 @@ void testTransform() {
context.initialize(); context.initialize();
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
int xsize = 28, ysize = 25, zsize = 30;
vector<Real2> original(xsize*ysize*zsize); vector<Real2> original(xsize*ysize*zsize);
vector<t_complex> reference(original.size()); vector<t_complex> reference(original.size());
for (int i = 0; i < (int) original.size(); i++) { for (int i = 0; i < (int) original.size(); i++) {
...@@ -67,10 +66,16 @@ void testTransform() { ...@@ -67,10 +66,16 @@ void testTransform() {
original[i] = value; original[i] = value;
reference[i] = t_complex(value.x, value.y); reference[i] = t_complex(value.x, value.y);
} }
for (int i = 0; i < (int) reference.size(); i++) {
if (realToComplex)
reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
else
reference[i] = t_complex(original[i].x, original[i].y);
}
OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1"); OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1");
OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2"); OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2");
grid1.upload(original); grid1.upload(original);
OpenCLFFT3D fft(context, xsize, ysize, zsize); OpenCLFFT3D fft(context, xsize, ysize, zsize, realToComplex);
// Perform a forward FFT, then verify the result is correct. // Perform a forward FFT, then verify the result is correct.
...@@ -80,10 +85,15 @@ void testTransform() { ...@@ -80,10 +85,15 @@ void testTransform() {
fftpack_t plan; fftpack_t plan;
fftpack_init_3d(&plan, xsize, ysize, zsize); fftpack_init_3d(&plan, xsize, ysize, zsize);
fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]); fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
for (int i = 0; i < (int) result.size(); ++i) { int outputZSize = (realToComplex ? zsize/2+1 : zsize);
ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3); for (int x = 0; x < xsize; x++)
ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3); for (int y = 0; y < ysize; y++)
} for (int z = 0; z < outputZSize; z++) {
int index1 = x*ysize*zsize + y*zsize + z;
int index2 = x*ysize*outputZSize + y*outputZSize + z;
ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
}
fftpack_destroy(plan); fftpack_destroy(plan);
// Perform a backward transform and see if we get the original values. // Perform a backward transform and see if we get the original values.
...@@ -91,7 +101,8 @@ void testTransform() { ...@@ -91,7 +101,8 @@ void testTransform() {
fft.execFFT(grid2, grid1, false); fft.execFFT(grid2, grid1, false);
grid1.download(result); grid1.download(result);
double scale = 1.0/(xsize*ysize*zsize); double scale = 1.0/(xsize*ysize*zsize);
for (int i = 0; i < (int) result.size(); ++i) { int valuesToCheck = (realToComplex ? original.size()/2 : original.size());
for (int i = 0; i < valuesToCheck; ++i) {
ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4); ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4);
ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4); ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4);
} }
...@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) { ...@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) {
try { try {
if (argc > 1) if (argc > 1)
platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1])); platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double") if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double") {
testTransform<mm_double2>(); testTransform<mm_double2>(false, 28, 25, 30);
else testTransform<mm_double2>(true, 28, 25, 25);
testTransform<mm_float2>(); testTransform<mm_double2>(true, 25, 28, 25);
testTransform<mm_double2>(true, 25, 25, 28);
testTransform<mm_double2>(true, 21, 25, 27);
}
else {
testTransform<mm_float2>(false, 28, 25, 30);
testTransform<mm_float2>(true, 28, 25, 25);
testTransform<mm_float2>(true, 25, 28, 25);
testTransform<mm_float2>(true, 25, 25, 28);
testTransform<mm_float2>(true, 21, 25, 27);
}
} }
catch(const exception& e) { catch(const exception& e) {
cout << "exception: " << e.what() << endl; cout << "exception: " << e.what() << endl;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2014 Stanford University and the Authors. * * Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -81,7 +81,7 @@ void testLargeSystem() { ...@@ -81,7 +81,7 @@ void testLargeSystem() {
const int numParticles = numMolecules*2; const int numParticles = numMolecules*2;
const double cutoff = 2.0; const double cutoff = 2.0;
const double boxSize = 4.0; const double boxSize = 4.0;
const double tolerance = 5; const double tolerance = 10;
System system; System system;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize)); system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
NonbondedForce* nonbonded = new NonbondedForce(); NonbondedForce* nonbonded = new NonbondedForce();
...@@ -114,7 +114,7 @@ void testLargeSystem() { ...@@ -114,7 +114,7 @@ void testLargeSystem() {
State finalState = context.getState(State::Forces | State::Energy | State::Positions); State finalState = context.getState(State::Forces | State::Energy | State::Positions);
ASSERT(finalState.getPotentialEnergy() < initialState.getPotentialEnergy()); ASSERT(finalState.getPotentialEnergy() < initialState.getPotentialEnergy());
// Compute the force magnitude, substracting off any component parallel to a constraint, and // Compute the force magnitude, subtracting off any component parallel to a constraint, and
// check that it satisfies the requested tolerance. // check that it satisfies the requested tolerance.
double forceNorm = 0.0; double forceNorm = 0.0;
...@@ -129,8 +129,8 @@ void testLargeSystem() { ...@@ -129,8 +129,8 @@ void testLargeSystem() {
f -= dir*dir.dot(f); f -= dir*dir.dot(f);
forceNorm += f.dot(f); forceNorm += f.dot(f);
} }
forceNorm = sqrt(forceNorm/(4*numMolecules)); forceNorm = sqrt(forceNorm/(5*numMolecules));
ASSERT(forceNorm < 3*tolerance); ASSERT(forceNorm < 2*tolerance);
} }
void testVirtualSites() { void testVirtualSites() {
...@@ -138,7 +138,7 @@ void testVirtualSites() { ...@@ -138,7 +138,7 @@ void testVirtualSites() {
const int numParticles = numMolecules*3; const int numParticles = numMolecules*3;
const double cutoff = 2.0; const double cutoff = 2.0;
const double boxSize = 4.0; const double boxSize = 4.0;
const double tolerance = 5; const double tolerance = 10;
System system; System system;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize)); system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
NonbondedForce* nonbonded = new NonbondedForce(); NonbondedForce* nonbonded = new NonbondedForce();
...@@ -195,8 +195,8 @@ void testVirtualSites() { ...@@ -195,8 +195,8 @@ void testVirtualSites() {
ASSERT_EQUAL_VEC((finalState.getPositions()[i+1]+finalState.getPositions()[i])*0.5, finalState.getPositions()[i+2], 1e-5); ASSERT_EQUAL_VEC((finalState.getPositions()[i+1]+finalState.getPositions()[i])*0.5, finalState.getPositions()[i+2], 1e-5);
} }
forceNorm = sqrt(forceNorm/(4*numMolecules)); forceNorm = sqrt(forceNorm/(5*numMolecules));
ASSERT(forceNorm < 3*tolerance); ASSERT(forceNorm < 2*tolerance);
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
......
...@@ -382,9 +382,9 @@ void ObcParameters::setPeriodic(OpenMM::RealVec* vectors) { ...@@ -382,9 +382,9 @@ void ObcParameters::setPeriodic(OpenMM::RealVec* vectors) {
assert(_cutoff); assert(_cutoff);
assert(boxSize[0][0] >= 2.0*_cutoffDistance); assert(vectors[0][0] >= 2.0*_cutoffDistance);
assert(boxSize[1][1] >= 2.0*_cutoffDistance); assert(vectors[1][1] >= 2.0*_cutoffDistance);
assert(boxSize[2][2] >= 2.0*_cutoffDistance); assert(vectors[2][2] >= 2.0*_cutoffDistance);
_periodic = true; _periodic = true;
_periodicBoxVectors[0] = vectors[0]; _periodicBoxVectors[0] = vectors[0];
......
...@@ -334,7 +334,7 @@ void testMonteCarlo() { ...@@ -334,7 +334,7 @@ void testMonteCarlo() {
integrator.addComputePerDof("oldx", "x"); integrator.addComputePerDof("oldx", "x");
integrator.addComputePerDof("x", "x+dt*gaussian"); integrator.addComputePerDof("x", "x+dt*gaussian");
integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)"); integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)");
integrator.addComputePerDof("x", "accept*x + (1-accept)*oldx"); integrator.addComputePerDof("x", "select(accept, x, oldx)");
HarmonicBondForce* forceField = new HarmonicBondForce(); HarmonicBondForce* forceField = new HarmonicBondForce();
forceField->addBond(0, 1, 2.0, 10.0); forceField->addBond(0, 1, 2.0, 10.0);
system.addForce(forceField); system.addForce(forceField);
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2014 Stanford University and the Authors. * * Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -129,8 +129,8 @@ void testLargeSystem() { ...@@ -129,8 +129,8 @@ void testLargeSystem() {
f -= dir*dir.dot(f); f -= dir*dir.dot(f);
forceNorm += f.dot(f); forceNorm += f.dot(f);
} }
forceNorm = sqrt(forceNorm/(4*numMolecules)); forceNorm = sqrt(forceNorm/(5*numMolecules));
ASSERT(forceNorm < 3*tolerance); ASSERT(forceNorm < 2*tolerance);
} }
void testVirtualSites() { void testVirtualSites() {
...@@ -195,8 +195,8 @@ void testVirtualSites() { ...@@ -195,8 +195,8 @@ void testVirtualSites() {
ASSERT_EQUAL_VEC((finalState.getPositions()[i+1]+finalState.getPositions()[i])*0.5, finalState.getPositions()[i+2], 1e-5); ASSERT_EQUAL_VEC((finalState.getPositions()[i+1]+finalState.getPositions()[i])*0.5, finalState.getPositions()[i+2], 1e-5);
} }
forceNorm = sqrt(forceNorm/(4*numMolecules)); forceNorm = sqrt(forceNorm/(5*numMolecules));
ASSERT(forceNorm < 3*tolerance); ASSERT(forceNorm < 2*tolerance);
} }
int main() { int main() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment