Commit 7bfb75c7 authored by Lee-Ping Wang's avatar Lee-Ping Wang
Browse files

Merge branch 'master' of github.com:SimTk/openmm

parents a85d163b 49cb91c5
...@@ -20,6 +20,5 @@ find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads) ...@@ -20,6 +20,5 @@ find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads)
# all listed variables are TRUE # all listed variables are TRUE
include (FindPackageHandleStandardArgs) include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES) find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES)
find_package_handle_standard_args (FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES)
mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES) mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES)
...@@ -30,6 +30,7 @@ for i in range(numPlatforms): ...@@ -30,6 +30,7 @@ for i in range(numPlatforms):
simulation = Simulation(pdb.topology, system, integrator, platform) simulation = Simulation(pdb.topology, system, integrator, platform)
simulation.context.setPositions(pdb.positions) simulation.context.setPositions(pdb.positions)
forces[i] = simulation.context.getState(getForces=True).getForces() forces[i] = simulation.context.getState(getForces=True).getForces()
del simulation
print "- Successfully computed forces" print "- Successfully computed forces"
except: except:
print "- Error computing forces with", platform.getName(), "platform" print "- Error computing forces with", platform.getName(), "platform"
......
...@@ -5222,6 +5222,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -5222,6 +5222,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
cu.setTime(cu.getTime()+integrator.getStepSize()); cu.setTime(cu.getTime()+integrator.getStepSize());
cu.setStepCount(cu.getStepCount()+1); cu.setStepCount(cu.getStepCount()+1);
cu.reorderAtoms(); cu.reorderAtoms();
if (cu.getAtomsWereReordered()) {
forcesAreValid = false;
validSavedForces.clear();
}
} }
double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) { double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
......
...@@ -32,7 +32,12 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con ...@@ -32,7 +32,12 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
*/ */
extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) { extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
#if __CUDA_ARCH__ >= 130
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else
float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
mixed4 vel = velm[index]; mixed4 vel = velm[index];
...@@ -48,9 +53,15 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real ...@@ -48,9 +53,15 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
#if __CUDA_ARCH__ >= 130
vel.x = (mixed) (invStepSize*delta.x); vel.x = (mixed) (invStepSize*delta.x);
vel.y = (mixed) (invStepSize*delta.y); vel.y = (mixed) (invStepSize*delta.y);
vel.z = (mixed) (invStepSize*delta.z); vel.z = (mixed) (invStepSize*delta.z);
#else
vel.x = invStepSize*delta.x + correction*delta.x;
vel.y = invStepSize*delta.y + correction*delta.x;
vel.z = invStepSize*delta.z + correction*delta.x;
#endif
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
......
...@@ -37,7 +37,12 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c ...@@ -37,7 +37,12 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq, extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) { real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
mixed2 stepSize = dt[0]; mixed2 stepSize = dt[0];
#if __CUDA_ARCH__ >= 130
double oneOverDt = 1.0/stepSize.y; double oneOverDt = 1.0/stepSize.y;
#else
float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0) if (index == 0)
dt[0].x = stepSize.y; dt[0].x = stepSize.y;
...@@ -55,7 +60,11 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* ...@@ -55,7 +60,11 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
#if __CUDA_ARCH__ >= 130
velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w); velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
#else
velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
#endif
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
......
...@@ -97,7 +97,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -97,7 +97,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Try to figure out which device is the fastest. // Try to figure out which device is the fastest.
int bestSpeed = -1; int bestSpeed = -1;
bool bestSupportsDouble = false;
for (int i = 0; i < (int) devices.size(); i++) { for (int i = 0; i < (int) devices.size(); i++) {
if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD") if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
continue; // Don't use AMD GPUs on OS X due to serious bugs. continue; // Don't use AMD GPUs on OS X due to serious bugs.
...@@ -136,11 +135,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -136,11 +135,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
} }
} }
int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos); if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
if (maxSize >= minThreadBlockSize && speed > bestSpeed && (supportsDouble || !bestSupportsDouble)) {
deviceIndex = i; deviceIndex = i;
bestSpeed = speed; bestSpeed = speed;
bestSupportsDouble = supportsDouble;
} }
} }
} }
...@@ -277,7 +274,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -277,7 +274,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers"); clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers"); clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer"); reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
reduceForcesKernel = cl::Kernel(utilities, "reduceForces"); if (supports64BitGlobalAtomics)
reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use. // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
......
...@@ -5444,6 +5444,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -5444,6 +5444,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl.setTime(cl.getTime()+integrator.getStepSize()); cl.setTime(cl.getTime()+integrator.getStepSize());
cl.setStepCount(cl.getStepCount()+1); cl.setStepCount(cl.getStepCount()+1);
cl.reorderAtoms(); cl.reorderAtoms();
if (cl.getAtomsWereReordered()) {
forcesAreValid = false;
validSavedForces.clear();
}
// Reduce UI lag. // Reduce UI lag.
......
...@@ -81,7 +81,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -81,7 +81,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
} }
else { else {
numForceThreadBlocks = context.getNumThreadBlocks(); numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = OpenCLContext::ThreadBlockSize; forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
if (context.getSupports64BitGlobalAtomics()) { if (context.getSupports64BitGlobalAtomics()) {
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels. // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1; numForceBuffers = 1;
......
...@@ -70,7 +70,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -70,7 +70,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
* to global memory. * to global memory.
*/ */
void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount, void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
__global ushort2* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
__global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) { __global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
real4 posBuffer[TILE_SIZE]; real4 posBuffer[TILE_SIZE];
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF && const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
...@@ -128,7 +128,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, ...@@ -128,7 +128,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
int baseIndex = atom_add(interactionCount, tilesToStore); int baseIndex = atom_add(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) { if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) { for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy); interactingTiles[baseIndex+i] = x;
for (int j = 0; j < TILE_SIZE; j++) for (int j = 0; j < TILE_SIZE; j++)
interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = atoms[i*TILE_SIZE+j]; interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = atoms[i*TILE_SIZE+j];
} }
...@@ -145,7 +145,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, ...@@ -145,7 +145,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
int baseIndex = atom_add(interactionCount, tilesToStore); int baseIndex = atom_add(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) { if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) { for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy); interactingTiles[baseIndex+i] = x;
for (int j = 0; j < TILE_SIZE; j++) { for (int j = 0; j < TILE_SIZE; j++) {
int index = i*TILE_SIZE+j; int index = i*TILE_SIZE+j;
interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = (index < *numAtoms ? atoms[index] : NUM_ATOMS); interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = (index < *numAtoms ? atoms[index] : NUM_ATOMS);
...@@ -159,9 +159,8 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, ...@@ -159,9 +159,8 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart, * Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting. * mark them as non-interacting.
*/ */
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter, __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
__global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
__global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox, unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) { __global const int* restrict rebuildNeighborList) {
...@@ -180,8 +179,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -180,8 +179,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
numAtoms = 0; numAtoms = 0;
real2 sortedKey = sortedBlocks[i]; real2 sortedKey = sortedBlocks[i];
unsigned short x = (unsigned short) sortedKey.y; unsigned short x = (unsigned short) sortedKey.y;
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = blockBoundingBox[x]; real4 blockSizeX = sortedBlockBoundingBox[i];
// Load exclusion data for block x. // Load exclusion data for block x.
......
...@@ -424,6 +424,7 @@ __kernel void computeGBSAForce1( ...@@ -424,6 +424,7 @@ __kernel void computeGBSAForce1(
localData[localAtomIndex].z = posq1.z; localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w; localData[localAtomIndex].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1; localData[get_local_id(0)].bornRadius = bornRadius1;
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q); real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
...@@ -454,8 +455,8 @@ __kernel void computeGBSAForce1( ...@@ -454,8 +455,8 @@ __kernel void computeGBSAForce1(
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
SYNC_WARPS;
} }
SYNC_WARPS;
} }
} }
else { else {
...@@ -472,6 +473,7 @@ __kernel void computeGBSAForce1( ...@@ -472,6 +473,7 @@ __kernel void computeGBSAForce1(
localData[get_local_id(0)].fy = 0.0f; localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[get_local_id(0)].fw = 0.0f;
SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
...@@ -627,6 +629,7 @@ __kernel void computeGBSAForce1( ...@@ -627,6 +629,7 @@ __kernel void computeGBSAForce1(
localData[get_local_id(0)].fz = 0.0f; localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[get_local_id(0)].fw = 0.0f;
} }
SYNC_WARPS;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
if (singlePeriodicCopy) { if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic // The box is small enough that we can just translate all the atoms into a single periodic
......
...@@ -36,6 +36,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea ...@@ -36,6 +36,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
float invStepSize = 1.0f/dt[0].y; float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif #endif
int index = get_global_id(0); int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
...@@ -53,7 +54,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea ...@@ -53,7 +54,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
#ifdef SUPPORTS_DOUBLE_PRECISION #ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz; vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else #else
vel.xyz = invStepSize*delta.xyz; vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
#endif #endif
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos); posq[index] = convert_real4(pos);
......
...@@ -81,6 +81,7 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, ...@@ -81,6 +81,7 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
} }
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
/** /**
* Sum the various buffers containing forces. * Sum the various buffers containing forces.
*/ */
...@@ -94,6 +95,7 @@ __kernel void reduceForces(__global const long* restrict longBuffer, __global re ...@@ -94,6 +95,7 @@ __kernel void reduceForces(__global const long* restrict longBuffer, __global re
buffer[index] = sum; buffer[index] = sum;
} }
} }
#endif
/** /**
* This is called to determine the accuracy of various native functions. * This is called to determine the accuracy of various native functions.
......
...@@ -38,6 +38,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _ ...@@ -38,6 +38,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
double oneOverDt = 1.0/stepSize.y; double oneOverDt = 1.0/stepSize.y;
#else #else
float oneOverDt = 1.0f/stepSize.y; float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif #endif
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
dt[0].x = stepSize.y; dt[0].x = stepSize.y;
...@@ -58,7 +59,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _ ...@@ -58,7 +59,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
#ifdef SUPPORTS_DOUBLE_PRECISION #ifdef SUPPORTS_DOUBLE_PRECISION
velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz; velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
#else #else
velocity.xyz = delta.xyz*oneOverDt; velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
#endif #endif
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos); posq[index] = convert_real4(pos);
......
...@@ -455,7 +455,7 @@ void testSwitchingFunction() { ...@@ -455,7 +455,7 @@ void testSwitchingFunction() {
positions[1] = Vec3(r+delta, 0, 0); positions[1] = Vec3(r+delta, 0, 0);
context.setPositions(positions); context.setPositions(positions);
double e2 = context.getState(State::Energy).getPotentialEnergy(); double e2 = context.getState(State::Energy).getPotentialEnergy();
ASSERT_EQUAL_TOL((e2-e1)/(2*delta), state.getForces()[0][0], 1e-3); ASSERT_EQUAL_TOL((e2-e1)/(2*delta), state.getForces()[0][0], 2e-3);
} }
} }
......
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
using namespace OpenMM; using namespace OpenMM;
extern "C" void registerPlatforms() { extern "C" OPENMM_EXPORT void registerPlatforms() {
} }
extern "C" OPENMM_EXPORT void registerKernelFactories() { extern "C" OPENMM_EXPORT void registerKernelFactories() {
......
...@@ -85,7 +85,10 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ...@@ -85,7 +85,10 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}) SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY}) TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY})
IF (FFTW_THREADS_LIBRARY)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${FFTW_THREADS_LIBRARY})
ENDIF (FFTW_THREADS_LIBRARY)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY") SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET}) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
......
...@@ -32,10 +32,10 @@ ...@@ -32,10 +32,10 @@
using namespace OpenMM; using namespace OpenMM;
extern "C" void registerPlatforms() { extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
} }
extern "C" void registerKernelFactories() { extern "C" OPENMM_EXPORT_PME void registerKernelFactories() {
if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) { if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
CpuPmeKernelFactory* factory = new CpuPmeKernelFactory(); CpuPmeKernelFactory* factory = new CpuPmeKernelFactory();
for (int i = 0; i < Platform::getNumPlatforms(); i++) for (int i = 0; i < Platform::getNumPlatforms(); i++)
......
...@@ -119,8 +119,9 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx ...@@ -119,8 +119,9 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
__m128 pos = _mm_load_ps(&posq[4*i]); __m128 pos = _mm_loadu_ps(&posq[4*i]);
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize)))); __m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
__m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize); __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
__m128i ti = _mm_cvttps_epi32(t); __m128i ti = _mm_cvttps_epi32(t);
__m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti)); __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
...@@ -309,8 +310,9 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo ...@@ -309,8 +310,9 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
__m128 pos = _mm_load_ps(&posq[4*i]); __m128 pos = _mm_loadu_ps(&posq[4*i]);
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize)))); __m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
__m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize); __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
__m128i ti = _mm_cvttps_epi32(t); __m128i ti = _mm_cvttps_epi32(t);
__m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti)); __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
...@@ -638,11 +640,11 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) { ...@@ -638,11 +640,11 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {
// Attempt to factor the current value. // Attempt to factor the current value.
int unfactored = minimum; int unfactored = minimum;
for (int factor = 2; factor < 12; factor++) { for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0) while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor; unfactored /= factor;
} }
if (unfactored == 1) if (unfactored == 1 || unfactored == 11 || unfactored == 13)
return minimum; return minimum;
minimum++; minimum++;
} }
......
...@@ -124,6 +124,10 @@ void testPME() { ...@@ -124,6 +124,10 @@ void testPME() {
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
try { try {
if (!CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
cout << "CPU is not supported. Exiting." << endl;
return 0;
}
testPME(); testPME();
} }
catch(const exception& e) { catch(const exception& e) {
......
...@@ -34,10 +34,10 @@ ...@@ -34,10 +34,10 @@
using namespace OpenMM; using namespace OpenMM;
extern "C" void registerPlatforms() { extern "C" OPENMM_EXPORT void registerPlatforms() {
} }
extern "C" void registerKernelFactories() { extern "C" OPENMM_EXPORT void registerKernelFactories() {
try { try {
Platform& platform = Platform::getPlatformByName("CUDA"); Platform& platform = Platform::getPlatformByName("CUDA");
CudaDrudeKernelFactory* factory = new CudaDrudeKernelFactory(); CudaDrudeKernelFactory* factory = new CudaDrudeKernelFactory();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment