Commit 7bfb75c7 authored by Lee-Ping Wang's avatar Lee-Ping Wang
Browse files

Merge branch 'master' of github.com:SimTk/openmm

parents a85d163b 49cb91c5
......@@ -20,6 +20,5 @@ find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads)
# all listed variables are TRUE
include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES)
find_package_handle_standard_args (FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES)
mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES)
......@@ -30,6 +30,7 @@ for i in range(numPlatforms):
simulation = Simulation(pdb.topology, system, integrator, platform)
simulation.context.setPositions(pdb.positions)
forces[i] = simulation.context.getState(getForces=True).getForces()
del simulation
print "- Successfully computed forces"
except:
print "- Error computing forces with", platform.getName(), "platform"
......
......@@ -5222,6 +5222,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
cu.setTime(cu.getTime()+integrator.getStepSize());
cu.setStepCount(cu.getStepCount()+1);
cu.reorderAtoms();
if (cu.getAtomsWereReordered()) {
forcesAreValid = false;
validSavedForces.clear();
}
}
double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
......
......@@ -32,7 +32,12 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
*/
extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
#if __CUDA_ARCH__ >= 130
double invStepSize = 1.0/dt[0].y;
#else
float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif
int index = blockIdx.x*blockDim.x+threadIdx.x;
while (index < NUM_ATOMS) {
mixed4 vel = velm[index];
......@@ -48,9 +53,15 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
#if __CUDA_ARCH__ >= 130
vel.x = (mixed) (invStepSize*delta.x);
vel.y = (mixed) (invStepSize*delta.y);
vel.z = (mixed) (invStepSize*delta.z);
#else
vel.x = invStepSize*delta.x + correction*delta.x;
vel.y = invStepSize*delta.y + correction*delta.x;
vel.z = invStepSize*delta.z + correction*delta.x;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
......
......@@ -37,7 +37,12 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
mixed2 stepSize = dt[0];
#if __CUDA_ARCH__ >= 130
double oneOverDt = 1.0/stepSize.y;
#else
float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0)
dt[0].x = stepSize.y;
......@@ -55,7 +60,11 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
#if __CUDA_ARCH__ >= 130
velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
#else
velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
......
......@@ -97,7 +97,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Try to figure out which device is the fastest.
int bestSpeed = -1;
bool bestSupportsDouble = false;
for (int i = 0; i < (int) devices.size(); i++) {
if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
continue; // Don't use AMD GPUs on OS X due to serious bugs.
......@@ -136,11 +135,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
}
int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
if (maxSize >= minThreadBlockSize && speed > bestSpeed && (supportsDouble || !bestSupportsDouble)) {
if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
deviceIndex = i;
bestSpeed = speed;
bestSupportsDouble = supportsDouble;
}
}
}
......@@ -277,7 +274,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
if (supports64BitGlobalAtomics)
reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
......
......@@ -5444,6 +5444,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl.setTime(cl.getTime()+integrator.getStepSize());
cl.setStepCount(cl.getStepCount()+1);
cl.reorderAtoms();
if (cl.getAtomsWereReordered()) {
forcesAreValid = false;
validSavedForces.clear();
}
// Reduce UI lag.
......
......@@ -81,7 +81,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
}
else {
numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = OpenCLContext::ThreadBlockSize;
forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
if (context.getSupports64BitGlobalAtomics()) {
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
......
......@@ -70,7 +70,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
* to global memory.
*/
void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
__global ushort2* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
__global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
__global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
real4 posBuffer[TILE_SIZE];
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
......@@ -128,7 +128,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
int baseIndex = atom_add(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy);
interactingTiles[baseIndex+i] = x;
for (int j = 0; j < TILE_SIZE; j++)
interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = atoms[i*TILE_SIZE+j];
}
......@@ -145,7 +145,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
int baseIndex = atom_add(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy);
interactingTiles[baseIndex+i] = x;
for (int j = 0; j < TILE_SIZE; j++) {
int index = i*TILE_SIZE+j;
interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = (index < *numAtoms ? atoms[index] : NUM_ATOMS);
......@@ -159,9 +159,8 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
__global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
......@@ -180,8 +179,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
unsigned short x = (unsigned short) sortedKey.y;
real4 blockCenterX = blockCenter[x];
real4 blockSizeX = blockBoundingBox[x];
real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i];
// Load exclusion data for block x.
......
......@@ -424,6 +424,7 @@ __kernel void computeGBSAForce1(
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
......@@ -454,8 +455,8 @@ __kernel void computeGBSAForce1(
#ifdef USE_CUTOFF
}
#endif
SYNC_WARPS;
}
SYNC_WARPS;
}
}
else {
......@@ -472,6 +473,7 @@ __kernel void computeGBSAForce1(
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
......@@ -627,6 +629,7 @@ __kernel void computeGBSAForce1(
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
}
SYNC_WARPS;
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
......
......@@ -36,6 +36,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
double invStepSize = 1.0/dt[0].y;
#else
float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif
int index = get_global_id(0);
while (index < NUM_ATOMS) {
......@@ -53,7 +54,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
#ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else
vel.xyz = invStepSize*delta.xyz;
vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
......
......@@ -81,6 +81,7 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
}
}
#ifdef SUPPORTS_64_BIT_ATOMICS
/**
* Sum the various buffers containing forces.
*/
......@@ -94,6 +95,7 @@ __kernel void reduceForces(__global const long* restrict longBuffer, __global re
buffer[index] = sum;
}
}
#endif
/**
* This is called to determine the accuracy of various native functions.
......
......@@ -38,6 +38,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
double oneOverDt = 1.0/stepSize.y;
#else
float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif
if (get_global_id(0) == 0)
dt[0].x = stepSize.y;
......@@ -58,7 +59,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
#ifdef SUPPORTS_DOUBLE_PRECISION
velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
#else
velocity.xyz = delta.xyz*oneOverDt;
velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
......
......@@ -455,7 +455,7 @@ void testSwitchingFunction() {
positions[1] = Vec3(r+delta, 0, 0);
context.setPositions(positions);
double e2 = context.getState(State::Energy).getPotentialEnergy();
ASSERT_EQUAL_TOL((e2-e1)/(2*delta), state.getForces()[0][0], 1e-3);
ASSERT_EQUAL_TOL((e2-e1)/(2*delta), state.getForces()[0][0], 2e-3);
}
}
......
......@@ -33,7 +33,7 @@
using namespace OpenMM;
extern "C" void registerPlatforms() {
extern "C" OPENMM_EXPORT void registerPlatforms() {
}
extern "C" OPENMM_EXPORT void registerKernelFactories() {
......
......@@ -85,7 +85,10 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY})
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY})
IF (FFTW_THREADS_LIBRARY)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${FFTW_THREADS_LIBRARY})
ENDIF (FFTW_THREADS_LIBRARY)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
......
......@@ -32,10 +32,10 @@
using namespace OpenMM;
extern "C" void registerPlatforms() {
extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
}
extern "C" void registerKernelFactories() {
extern "C" OPENMM_EXPORT_PME void registerKernelFactories() {
if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
CpuPmeKernelFactory* factory = new CpuPmeKernelFactory();
for (int i = 0; i < Platform::getNumPlatforms(); i++)
......
......@@ -119,8 +119,9 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point.
__m128 pos = _mm_load_ps(&posq[4*i]);
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize))));
__m128 pos = _mm_loadu_ps(&posq[4*i]);
__m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
__m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
__m128i ti = _mm_cvttps_epi32(t);
__m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
......@@ -309,8 +310,9 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point.
__m128 pos = _mm_load_ps(&posq[4*i]);
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize))));
__m128 pos = _mm_loadu_ps(&posq[4*i]);
__m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
__m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
__m128i ti = _mm_cvttps_epi32(t);
__m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
......@@ -638,11 +640,11 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 12; factor++) {
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
if (unfactored == 1 || unfactored == 11 || unfactored == 13)
return minimum;
minimum++;
}
......
......@@ -124,6 +124,10 @@ void testPME() {
int main(int argc, char* argv[]) {
try {
if (!CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
cout << "CPU is not supported. Exiting." << endl;
return 0;
}
testPME();
}
catch(const exception& e) {
......
......@@ -34,10 +34,10 @@
using namespace OpenMM;
extern "C" void registerPlatforms() {
extern "C" OPENMM_EXPORT void registerPlatforms() {
}
extern "C" void registerKernelFactories() {
extern "C" OPENMM_EXPORT void registerKernelFactories() {
try {
Platform& platform = Platform::getPlatformByName("CUDA");
CudaDrudeKernelFactory* factory = new CudaDrudeKernelFactory();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment