Commit 71d33617 authored by John Chodera (MSKCC)'s avatar John Chodera (MSKCC)
Browse files

Merge remote-tracking branch 'upstream/master'

parents eb232608 9da36463
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -344,6 +344,13 @@ public:
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CMAPTorsionForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
private:
class Task;
OpenCLPlatform::PlatformData& data;
......
......@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one
if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
continue;
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD"))
continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
continue; // The CPU device on OS X won't work correctly.
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
......@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = "";
else if (platformVendor == "Apple")
defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
else
defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
......@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
else
simdWidth = 1;
if (platformVendor == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision)
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -35,25 +35,109 @@
using namespace OpenMM;
using namespace std;
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
zkernel = createKernel(xsize, ysize, zsize, zthreads);
xkernel = createKernel(ysize, zsize, xsize, xthreads);
ykernel = createKernel(zsize, xsize, ysize, ythreads);
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
packRealAsComplex = false;
int packedXSize = xsize;
int packedYSize = ysize;
int packedZSize = zsize;
if (realToComplex) {
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex = true;
int packedAxis, bufferSize;
if (xsize%2 == 0) {
packedAxis = 0;
packedXSize /= 2;
bufferSize = packedXSize;
}
else if (ysize%2 == 0) {
packedAxis = 1;
packedYSize /= 2;
bufferSize = packedYSize;
}
else if (zsize%2 == 0) {
packedAxis = 2;
packedZSize /= 2;
bufferSize = packedZSize;
}
else
packRealAsComplex = false;
if (packRealAsComplex) {
// Build the kernels for packing and unpacking the data.
map<string, string> defines;
defines["XSIZE"] = context.intToString(xsize);
defines["YSIZE"] = context.intToString(ysize);
defines["ZSIZE"] = context.intToString(zsize);
defines["PACKED_AXIS"] = context.intToString(packedAxis);
defines["PACKED_XSIZE"] = context.intToString(packedXSize);
defines["PACKED_YSIZE"] = context.intToString(packedYSize);
defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
defines["M_PI"] = context.doubleToString(M_PI);
cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
packForwardKernel = cl::Kernel(program, "packForwardData");
unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
packBackwardKernel = cl::Kernel(program, "packBackwardData");
packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
}
}
bool inputIsReal = (realToComplex && !packRealAsComplex);
zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
}
void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
zkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
xkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
ykernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
cl::Kernel kernel3 = (forward ? ykernel : invykernel);
if (packRealAsComplex) {
cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
int gridSize = xsize*ysize*zsize/2;
// Pack the data into a half sized grid.
packKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
packKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(packKernel, gridSize);
// Perform the FFT.
kernel1.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel1, gridSize, zthreads);
kernel2.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel2, gridSize, xthreads);
kernel3.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel3.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel3, gridSize, ythreads);
// Unpack the data.
unpackKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
unpackKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(unpackKernel, gridSize);
}
else {
kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
}
}
int OpenCLFFT3D::findLegalDimension(int minimum) {
......@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
while (maxThreads > 128 && maxThreads-64 >= zsize)
maxThreads -= 64;
bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
while (true) {
bool loopRequired = (zsize > maxThreads || isCPU);
......@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
source<<"real2 t0 = b0+b1;\n";
source<<"real2 t1 = b2+b3;\n";
source<<"real2 t2 = b4-b3;\n";
......@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 d7 = d6+d5;\n";
source<<"real2 d8 = d6-d5;\n";
string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
......@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 d0 = c0+c2;\n";
source<<"real2 d1 = c0-c2;\n";
source<<"real2 d2 = c1+c3;\n";
source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
......@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 d0 = c1+c2;\n";
source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
......@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel.
bool outputIsReal = (inputIsReal && axis == 2 && !forward);
bool outputIsPacked = (inputIsReal && axis == 2 && forward);
string outputSuffix = (outputIsReal ? ".x" : "");
if (loopRequired) {
if (outputIsPacked)
source<<"if (x < XSIZE/2+1)\n";
source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
if (outputIsPacked)
source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
else
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
}
else {
source<<"if (index < XSIZE*YSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
if (outputIsPacked) {
source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
}
else {
source<<"if (index < XSIZE*YSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
}
}
map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize);
......@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
replacements["SIGN"] = (forward ? "1" : "-1");
replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT");
threads = (isCPU ? 1 : blocksPerGroup*zsize);
......@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue;
}
int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(2, bufferSize, NULL);
kernel.setArg(3, bufferSize, NULL);
kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL);
return kernel;
}
}
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2014 Stanford University and the Authors. *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,6 +32,7 @@
#include "openmm/VirtualSite.h"
#include "quern.h"
#include "OpenCLExpressionUtilities.h"
#include "ReferenceCCMAAlgorithm.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
......@@ -323,157 +324,54 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
int numCCMA = (int) ccmaConstraints.size();
if (numCCMA > 0) {
vector<vector<int> > atomConstraints(context.getNumAtoms());
// Record information needed by ReferenceCCMAAlgorithm.
vector<pair<int, int> > refIndices(numCCMA);
vector<RealOpenMM> refDistance(numCCMA);
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
vector<vector<int> > linkedConstraints(numCCMA);
for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
for (unsigned j = 0; j < i; j++) {
int c1 = atomConstraints[atom][i];
int c2 = atomConstraints[atom][j];
linkedConstraints[c1].push_back(c2);
linkedConstraints[c2].push_back(c1);
}
int index = ccmaConstraints[i];
refIndices[i] = make_pair(atom1[index], atom2[index]);
refDistance[i] = distance[index];
}
int maxLinks = 0;
for (unsigned i = 0; i < linkedConstraints.size(); i++)
maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Compute the constraint coupling matrix
vector<vector<int> > atomAngles(numAtoms);
HarmonicAngleForce const* angleForce = NULL;
for (int i = 0; i < system.getNumForces() && angleForce == NULL; i++)
angleForce = dynamic_cast<HarmonicAngleForce const*>(&system.getForce(i));
if (angleForce != NULL)
for (int i = 0; i < angleForce->getNumAngles(); i++) {
int particle1, particle2, particle3;
double angle, k;
angleForce->getAngleParameters(i, particle1, particle2, particle3, angle, k);
atomAngles[particle2].push_back(i);
}
vector<vector<pair<int, double> > > matrix(numCCMA);
for (int j = 0; j < numCCMA; j++) {
for (int k = 0; k < numCCMA; k++) {
if (j == k) {
matrix[j].push_back(pair<int, double>(j, 1.0));
continue;
}
double scale;
int cj = ccmaConstraints[j];
int ck = ccmaConstraints[k];
int atomj0 = atom1[cj];
int atomj1 = atom2[cj];
int atomk0 = atom1[ck];
int atomk1 = atom2[ck];
int atoma, atomb, atomc;
double imj0 = 1.0/system.getParticleMass(atomj0);
double imj1 = 1.0/system.getParticleMass(atomj1);
if (atomj0 == atomk0) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk1;
scale = imj0/(imj0+imj1);
}
else if (atomj1 == atomk1) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk0;
scale = imj1/(imj0+imj1);
}
else if (atomj0 == atomk1) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk0;
scale = imj0/(imj0+imj1);
}
else if (atomj1 == atomk0) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk1;
scale = imj1/(imj0+imj1);
}
else
continue; // These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
vector<RealOpenMM> refMasses(numAtoms);
for (int i = 0; i < numAtoms; ++i)
refMasses[i] = (RealOpenMM) system.getParticleMass(i);
bool foundConstraint = false;
for (int m = 0; m < numCCMA; m++) {
int other = ccmaConstraints[m];
if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
double d1 = distance[cj];
double d2 = distance[ck];
double d3 = distance[other];
matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
foundConstraint = true;
break;
}
}
if (!foundConstraint && angleForce != NULL) {
// We didn't find one, so look for an angle force field term.
const vector<int>& angleCandidates = atomAngles[atomb];
for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
int particle1, particle2, particle3;
double angle, ka;
angleForce->getAngleParameters(*iter, particle1, particle2, particle3, angle, ka);
if ((particle1 == atoma && particle3 == atomc) || (particle3 == atoma && particle1 == atomc)) {
matrix[j].push_back(pair<int, double>(k, scale*cos(angle)));
break;
}
}
// Look up angles for CCMA.
vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
for (int i = 0; i < system.getNumForces(); i++) {
const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
if (force != NULL) {
for (int j = 0; j < force->getNumAngles(); j++) {
int atom1, atom2, atom3;
double angle, k;
force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, (RealOpenMM) angle));
}
}
}
// Invert it using QR.
vector<int> matrixRowStart;
vector<int> matrixColIndex;
vector<double> matrixValue;
for (int i = 0; i < numCCMA; i++) {
matrixRowStart.push_back(matrixValue.size());
for (int j = 0; j < (int) matrix[i].size(); j++) {
pair<int, double> element = matrix[i][j];
matrixColIndex.push_back(element.first);
matrixValue.push_back(element.second);
}
}
matrixRowStart.push_back(matrixValue.size());
int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
double *qValue, *rValue;
int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
&qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
vector<double> rhs(numCCMA);
matrix.clear();
matrix.resize(numCCMA);
for (int i = 0; i < numCCMA; i++) {
// Extract column i of the inverse matrix.
for (int j = 0; j < numCCMA; j++)
rhs[j] = (i == j ? 1.0 : 0.0);
result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numCCMA; j++) {
double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
if (abs(value) > 0.1)
matrix[j].push_back(pair<int, double>(i, value));
}
}
QUERN_free_result(qRowStart, qColIndex, qValue);
QUERN_free_result(rRowStart, rColIndex, rValue);
// Create a ReferenceCCMAAlgorithm. It will build and invert the constraint matrix for us.
ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
int maxRowElements = 0;
for (unsigned i = 0; i < matrix.size(); i++)
maxRowElements = max(maxRowElements, (int) matrix[i].size());
maxRowElements++;
// Build the list of constraints for each atom.
vector<vector<int> > atomConstraints(context.getNumAtoms());
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Sort the constraints.
vector<int> constraintOrder(numCCMA);
......
This diff is collapsed.
......@@ -492,6 +492,11 @@ double OpenCLParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, b
return 0.0;
}
void OpenCLParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class OpenCLParallelCalcCustomTorsionForceKernel::Task : public OpenCLContext::WorkTask {
public:
Task(ContextImpl& context, OpenCLCalcCustomTorsionForceKernel& kernel, bool includeForce,
......
......@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool OpenCLPlatform::isPlatformSupported() {
// Return false for OpenCL implementations that are known
// to be buggy (Apple OSX since 10.7.5)
// to be buggy (Apple OS X prior to 10.10).
#ifdef __APPLE__
char str[256];
......@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
return false;
if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
// point at which a number of serious bugs were introduced into the
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
if (major < 14 || (major == 14 && minor < 3))
// 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// contained a number of serious bugs in the Apple OpenCL libraries.
// (See https://github.com/SimTk/openmm/issues/395 for example.)
return false;
#endif
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2013 Stanford University and the Authors. *
* Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList");
computeRangeKernel = cl::Kernel(program, "computeRange");
......@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
// On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
;
positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
......
__kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
float uniform, float gaussian, __global const real* restrict energy) {
float uniform, float gaussian, const real energy) {
COMPUTE_STEP
}
......@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, const real energy
PARAMETER_ARGUMENTS) {
mixed stepSize = dt[0].y;
int index = get_global_id(0);
......
......@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int start = block2*TILE_SIZE;
int included[TILE_SIZE];
int numIncluded = 0;
SYNC_WARPS;
positionCache[get_local_id(0)] = posq[start+indexInWarp];
SYNC_WARPS;
if (atom1 < NUM_ATOMS) {
for (int j = 0; j < 32; j++) {
int atom2 = start+j;
......@@ -287,7 +289,7 @@ __kernel void computeNeighborStartIndices(__global int* restrict numNeighborsFor
unsigned int globalIndex = startAtom+get_local_id(0);
posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// Perform a parallel prefix sum.
......
......@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) {
return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
}
/**
* Load a value from the half-complex grid produces by a real-to-complex transform.
*/
real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
const int inputZSize = ZSIZE/2+1;
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
return (real2) (value.x, -value.y);
}
/**
* Perform a 1D FFT on each row along one axis.
*/
__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
__kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TYPE* restrict out, __local real2* restrict w,
__local real2* restrict data0, __local real2* restrict data1) {
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
w[i] = (real2) (cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE);
for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
int index = baseIndex+get_local_id(0)/ZSIZE;
int x = index/YSIZE;
int y = index-x*YSIZE;
#if OUTPUT_IS_PACKED
if (x < XSIZE/2+1) {
#endif
#if LOOP_REQUIRED
for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
#if INPUT_IS_REAL
data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
#elif INPUT_IS_PACKED
data0[z] = loadComplexValue(in, x, y, z);
#else
data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
#endif
#else
if (index < XSIZE*YSIZE)
#if INPUT_IS_REAL
data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
#elif INPUT_IS_PACKED
data0[get_local_id(0)] = loadComplexValue(in, x, y, get_local_id(0)%ZSIZE);
#else
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
#endif
#endif
#if OUTPUT_IS_PACKED
}
#endif
barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT
......
/**
* Combine the two halves of a real grid into a complex grid that is half as large.
*/
__kernel void packForwardData(__global const real* restrict in, __global real2* restrict out) {
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
#if PACKED_AXIS == 0
real2 value = (real2) (in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
#elif PACKED_AXIS == 1
real2 value = (real2) (in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
#else
real2 value = (real2) (in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
#endif
out[index] = value;
}
}
/**
* Split the transformed data back into a full sized, symmetric grid.
*/
__kernel void unpackForwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
// Compute the phase factors.
#if PACKED_AXIS == 0
for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
w[i] = (real2) (sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
#elif PACKED_AXIS == 1
for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
w[i] = (real2) (sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
#else
for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
w[i] = (real2) (sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
#endif
barrier(CLK_LOCAL_MEM_FENCE);
// Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
const int outputZSize = ZSIZE/2+1;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
#if PACKED_AXIS == 0
real2 wfac = w[x];
#elif PACKED_AXIS == 1
real2 wfac = w[y];
#else
real2 wfac = w[z];
#endif
real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
if (z < outputZSize)
out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
xp = (x == 0 ? 0 : XSIZE-x);
yp = (y == 0 ? 0 : YSIZE-y);
zp = (z == 0 ? 0 : ZSIZE-z);
if (zp < outputZSize) {
#if PACKED_AXIS == 0
if (x == 0)
out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#elif PACKED_AXIS == 1
if (y == 0)
out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#else
if (z == 0)
out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#endif
else
out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) (output.x, -output.y);
}
}
}
/**
* Load a value from the half-complex grid produced by a real-to-complex transform.
*/
real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
const int inputZSize = ZSIZE/2+1;
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
return (real2) (value.x, -value.y);
}
/**
* Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
*/
__kernel void packBackwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
// Compute the phase factors.
#if PACKED_AXIS == 0
for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
w[i] = (real2) (cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
#elif PACKED_AXIS == 1
for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
w[i] = (real2) (cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
#else
for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
w[i] = (real2) (cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
#endif
barrier(CLK_LOCAL_MEM_FENCE);
// Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = loadComplexValue(in, x, y, z);
#if PACKED_AXIS == 0
real2 wfac = w[x];
real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
#elif PACKED_AXIS == 1
real2 wfac = w[y];
real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
#else
real2 wfac = w[z];
real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
#endif
real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2);
real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2);
odd = (real2) (odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = (real2) (even.x-odd.y, even.y+odd.x);
}
}
/**
* Split the data back into a full sized, real grid after an inverse transform.
*/
__kernel void unpackBackwardData(__global const real2* restrict in, __global real* restrict out) {
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
real2 value = 2*in[index];
#if PACKED_AXIS == 0
out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
#elif PACKED_AXIS == 1
out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
#else
out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
#endif
}
}
......@@ -5,16 +5,15 @@
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq,
__global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
__global real2* restrict sortedBlocks) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.xyz -= floor(pos.xyz*invPeriodicBoxSize.xyz)*periodicBoxSize.xyz;
real4 firstPoint = pos;
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
......@@ -22,7 +21,8 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
for (int i = base+1; i < last; i++) {
pos = posq[i];
#ifdef USE_PERIODIC
pos.xyz -= floor((pos.xyz-firstPoint.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
......@@ -44,7 +44,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList) {
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
......@@ -53,7 +53,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = false;
bool rebuild = forceRebuild;
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
......@@ -70,8 +70,8 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
* to global memory.
*/
void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
__global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
__global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
__global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX,
real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
real4 posBuffer[TILE_SIZE];
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
......@@ -83,7 +83,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
}
#endif
posBuffer[i] = pos;
......@@ -99,14 +99,14 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
real4 pos = posq[atom];
#ifdef USE_PERIODIC
if (singlePeriodicCopy)
pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
#endif
bool interacts = false;
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE && !interacts; j++) {
real4 delta = pos-posBuffer[j];
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
interacts = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
......@@ -159,9 +159,10 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
__global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
......@@ -204,9 +205,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
real4 blockSizeY = sortedBlockBoundingBox[j];
real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
......@@ -216,12 +215,12 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
buffer[valuesInBuffer++] = y;
if (valuesInBuffer == BUFFER_SIZE) {
storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, blockCenterX, blockSizeX, maxTiles, false);
storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, blockCenterX, blockSizeX, maxTiles, false);
valuesInBuffer = 0;
}
}
}
storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, blockCenterX, blockSizeX, maxTiles, true);
storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, blockCenterX, blockSizeX, maxTiles, true);
}
// Record the positions the neighbor list is based on.
......
......@@ -20,8 +20,9 @@ __kernel void computeBornSum(
#endif
__global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
#else
unsigned int numTiles,
#endif
......@@ -62,7 +63,7 @@ __kernel void computeBornSum(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
......@@ -111,7 +112,7 @@ __kernel void computeBornSum(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -253,14 +254,13 @@ __kernel void computeBornSum(
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0;
real4 posq1 = posq[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
......@@ -321,7 +321,7 @@ __kernel void computeBornSum(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
......@@ -411,8 +411,9 @@ __kernel void computeGBSAForce1(
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
#else
unsigned int numTiles,
#endif
......@@ -452,7 +453,7 @@ __kernel void computeGBSAForce1(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -516,7 +517,7 @@ __kernel void computeGBSAForce1(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -669,15 +670,13 @@ __kernel void computeGBSAForce1(
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 posq1 = posq[atom1];
posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
......@@ -740,7 +739,7 @@ __kernel void computeGBSAForce1(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
......
......@@ -23,7 +23,8 @@ __kernel void computeNonbonded(
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms
#endif
PARAMETER_ARGUMENTS) {
real energy = 0;
......@@ -65,7 +66,7 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
......@@ -133,7 +134,7 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
......@@ -291,15 +292,13 @@ __kernel void computeNonbonded(
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 posq1 = posq[atom1];
posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
......@@ -364,7 +363,7 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
......
......@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
#ifdef USE_DOUBLE_PRECISION
atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
// On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance. Why?
// I have no idea! And of course on AMD it produces slower performance. GPUs are not meant to be understood.
atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
#else
atom_add(&pmeGrid[index], (long) (add*0x100000000));
#endif
}
}
}
}
}
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
__global real2* realGrid = (__global real2*) pmeGrid;
__kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global real* restrict realGrid) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0x100000000;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
#ifdef USE_DOUBLE_PRECISION
long value = pmeGrid[2*index];
#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2];
#else
long value = pmeGrid[index];
long value = fixedGrid[index];
#endif
real2 realValue = (real2) ((real) (value*scale), 0);
realGrid[index] = realValue;
realGrid[index] = (real) (value*scale);
}
}
#elif defined(DEVICE_IS_CPU)
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
__global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
if (firstx == lastx)
......@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
}
}
}
......@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
#else
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
__global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point.
......@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
}
}
pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
pmeGrid[gridIndex] = result*EPSILON_FACTOR;
}
}
#endif
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, real recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real energy = 0.0f;
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
int ky = remainder/GRID_SIZE_Z;
int kz = remainder-ky*GRID_SIZE_Z;
if (kx == 0 && ky == 0 && kz == 0)
continue;
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int ky = remainder/(GRID_SIZE_Z/2+1);
int kz = remainder-ky*(GRID_SIZE_Z/2+1);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
......@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
if (kx != 0 || ky != 0 || kz != 0) {
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
}
}
}
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
}
energyBuffer[get_global_id(0)] += 0.5f*energy;
}
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
const real4 scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER];
......@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real gridvalue = pmeGrid[index].x;
real gridvalue = pmeGrid[index];
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
......
......@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
float maxValue = (float) (range[1]);
float bucketWidth = (maxValue-minValue)/numBuckets;
for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
#if defined(MAC_AMD_WORKAROUND) && VALUE_IS_INT2
__global int* d = (__global int*) data;
int2 element = (int2) (d[2*index], d[2*index+1]);
float key = (float) getValue(element);
#else
float key = (float) getValue(data[index]);
#endif
uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -105,11 +105,68 @@ void testCMAPTorsions() {
}
}
void testChangingParameters() {
// Create a system with two maps and one torsion.
const int mapSize = 8;
System system;
for (int i = 0; i < 5; i++)
system.addParticle(1.0);
CMAPTorsionForce* cmap = new CMAPTorsionForce();
vector<double> mapEnergy1(mapSize*mapSize);
vector<double> mapEnergy2(mapSize*mapSize);
for (int i = 0; i < mapSize; i++) {
double angle1 = i*2*M_PI/mapSize;
double energy1 = cos(angle1);
for (int j = 0; j < mapSize; j++) {
double angle2 = j*2*M_PI/mapSize;
double energy2 = 10*sin(angle2);
mapEnergy1[i+j*mapSize] = energy1+energy2;
mapEnergy2[i+j*mapSize] = energy1-energy2;
}
}
cmap->addMap(mapSize, mapEnergy1);
cmap->addMap(mapSize, mapEnergy2);
cmap->addTorsion(0, 0, 1, 2, 3, 1, 2, 3, 4);
system.addForce(cmap);
// Set particle positions so angle1=0 and angle2=PI/4.
vector<Vec3> positions(5);
positions[0] = Vec3(0, 0, 1);
positions[1] = Vec3(0, 0, 0);
positions[2] = Vec3(1, 0, 0);
positions[3] = Vec3(1, 0, 1);
positions[4] = Vec3(0.5, -0.5, 1);
VerletIntegrator integrator(0.01);
Context context(system, integrator, platform);
context.setPositions(positions);
// Check that the energy is correct.
double energy = context.getState(State::Energy).getPotentialEnergy();
ASSERT_EQUAL_TOL(1+10*sin(M_PI/4), energy, 1e-5);
// Modify the parameters.
cmap->setTorsionParameters(0, 1, 0, 1, 2, 3, 1, 2, 3, 4);
for (int i = 0; i < mapSize*mapSize; i++)
mapEnergy2[i] *= 2.0;
cmap->setMapParameters(1, mapSize, mapEnergy2);
cmap->updateParametersInContext(context);
// See if the results are correct.
energy = context.getState(State::Energy).getPotentialEnergy();
ASSERT_EQUAL_TOL(2-20*sin(M_PI/4), energy, 1e-5);
}
int main(int argc, char* argv[]) {
try {
if (argc > 1)
platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
testCMAPTorsions();
testChangingParameters();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors. *
* Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -51,7 +51,7 @@ using namespace std;
static OpenCLPlatform platform;
template <class Real2>
void testTransform() {
void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
System system;
system.addParticle(0.0);
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
......@@ -59,7 +59,6 @@ void testTransform() {
context.initialize();
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
int xsize = 28, ysize = 25, zsize = 30;
vector<Real2> original(xsize*ysize*zsize);
vector<t_complex> reference(original.size());
for (int i = 0; i < (int) original.size(); i++) {
......@@ -67,10 +66,16 @@ void testTransform() {
original[i] = value;
reference[i] = t_complex(value.x, value.y);
}
for (int i = 0; i < (int) reference.size(); i++) {
if (realToComplex)
reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
else
reference[i] = t_complex(original[i].x, original[i].y);
}
OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1");
OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2");
grid1.upload(original);
OpenCLFFT3D fft(context, xsize, ysize, zsize);
OpenCLFFT3D fft(context, xsize, ysize, zsize, realToComplex);
// Perform a forward FFT, then verify the result is correct.
......@@ -80,10 +85,15 @@ void testTransform() {
fftpack_t plan;
fftpack_init_3d(&plan, xsize, ysize, zsize);
fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
for (int i = 0; i < (int) result.size(); ++i) {
ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3);
ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3);
}
int outputZSize = (realToComplex ? zsize/2+1 : zsize);
for (int x = 0; x < xsize; x++)
for (int y = 0; y < ysize; y++)
for (int z = 0; z < outputZSize; z++) {
int index1 = x*ysize*zsize + y*zsize + z;
int index2 = x*ysize*outputZSize + y*outputZSize + z;
ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
}
fftpack_destroy(plan);
// Perform a backward transform and see if we get the original values.
......@@ -91,7 +101,8 @@ void testTransform() {
fft.execFFT(grid2, grid1, false);
grid1.download(result);
double scale = 1.0/(xsize*ysize*zsize);
for (int i = 0; i < (int) result.size(); ++i) {
int valuesToCheck = (realToComplex ? original.size()/2 : original.size());
for (int i = 0; i < valuesToCheck; ++i) {
ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4);
ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4);
}
......@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) {
try {
if (argc > 1)
platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double")
testTransform<mm_double2>();
else
testTransform<mm_float2>();
if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double") {
testTransform<mm_double2>(false, 28, 25, 30);
testTransform<mm_double2>(true, 28, 25, 25);
testTransform<mm_double2>(true, 25, 28, 25);
testTransform<mm_double2>(true, 25, 25, 28);
testTransform<mm_double2>(true, 21, 25, 27);
}
else {
testTransform<mm_float2>(false, 28, 25, 30);
testTransform<mm_float2>(true, 28, 25, 25);
testTransform<mm_float2>(true, 25, 28, 25);
testTransform<mm_float2>(true, 25, 25, 28);
testTransform<mm_float2>(true, 21, 25, 27);
}
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment