Commit 3cb25ad8 authored by Lee-Ping Wang's avatar Lee-Ping Wang
Browse files

Merge branch 'master' of github.com:leeping/openmm

parents 7bfb75c7 24608623
......@@ -35,8 +35,8 @@
using namespace OpenMM;
MonteCarloAnisotropicBarostat::MonteCarloAnisotropicBarostat(const Vec3& defaultPressure, double temperature, int frequency, bool scaleX, bool scaleY, bool scaleZ) :
defaultPressure(defaultPressure), temperature(temperature), frequency(frequency), scaleX(scaleX), scaleY(scaleY), scaleZ(scaleZ) {
MonteCarloAnisotropicBarostat::MonteCarloAnisotropicBarostat(const Vec3& defaultPressure, double temperature, bool scaleX, bool scaleY, bool scaleZ, int frequency) :
defaultPressure(defaultPressure), temperature(temperature), scaleX(scaleX), scaleY(scaleY), scaleZ(scaleZ), frequency(frequency) {
setRandomNumberSeed((int) time(NULL));
}
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -489,7 +489,6 @@ private:
struct MoleculeGroup;
class VirtualSiteInfo;
void findMoleculeGroups();
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
......@@ -515,7 +514,7 @@ private:
int numAtomBlocks;
int numThreadBlocks;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered;
std::string compiler, tempDir, gpuArchitecture;
std::string compiler, tempDir, cacheDir, gpuArchitecture;
float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 periodicBoxSize, invPeriodicBoxSize;
std::string defaultOptimizationOptions;
......
......@@ -638,7 +638,7 @@ private:
class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
public:
CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomNonbondedForceKernel(name, platform),
cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), forceCopy(NULL), system(system) {
cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), interactionGroupData(NULL), forceCopy(NULL), system(system), hasInitializedKernel(false) {
}
~CudaCalcCustomNonbondedForceKernel();
/**
......@@ -665,15 +665,20 @@ public:
*/
void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
private:
void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
CudaContext& cu;
CudaParameterSet* params;
CudaArray* globals;
CudaArray* tabulatedFunctionParams;
CudaArray* interactionGroupData;
CUfunction interactionGroupKernel;
std::vector<void*> interactionGroupArgs;
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
std::vector<CudaArray*> tabulatedFunctions;
double longRangeCoefficient;
bool hasInitializedLongRangeCorrection;
bool hasInitializedLongRangeCorrection, hasInitializedKernel;
int numGroupThreadBlocks;
CustomNonbondedForce* forceCopy;
const System& system;
};
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -35,19 +35,25 @@
#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "SHA1.h"
#include "hilbert.h"
#include "openmm/OpenMMException.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "CudaExpressionUtilities.h"
#include "openmm/internal/ContextImpl.h"
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <typeinfo>
#include <cudaProfiler.h>
#ifndef WIN32
#include <unistd.h>
#endif
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
......@@ -87,10 +93,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
else
throw OpenMMException("Illegal value for CudaPrecision: "+precision);
char* cacheVariable = getenv("OPENMM_CACHE_DIR");
cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
#ifdef WIN32
this->tempDir = tempDir+"\\";
cacheDir = cacheDir+"\\";
#else
this->tempDir = tempDir+"/";
cacheDir = cacheDir+"/";
#endif
contextIndex = platformData.contexts.size();
int numDevices;
......@@ -214,6 +224,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Create the work thread used for parallelization when running on multiple devices.
......@@ -347,6 +359,7 @@ static bool compileInWindows(const string &command) {
#endif
CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
......@@ -394,17 +407,38 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << endl;
src << source << endl;
// See whether we already have PTX for this kernel cached.
CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final();
UINT_8 hash[20];
sha1.GetHash(hash);
stringstream cacheFile;
cacheFile << cacheDir;
cacheFile.flags(ios::hex);
for (int i = 0; i < 20; i++)
cacheFile << setw(2) << setfill('0') << (int) hash[i];
cacheFile << '_' << gpuArchitecture << '_' << bits;
CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module;
// Write out the source to a temporary file.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
tempFileName << "_" << GetCurrentProcessId();
#else
tempFileName << "_" << getpid();
#endif
string inputFile = (tempDir+tempFileName.str()+".cu");
string outputFile = (tempDir+tempFileName.str()+".ptx");
string logFile = (tempDir+tempFileName.str()+".log");
ofstream out(inputFile.c_str());
out << src.str();
out.close();
string bits = intToString(8*sizeof(void*));
#ifdef WIN32
#ifdef _DEBUG
string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
......@@ -433,7 +467,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
throw OpenMMException(error.str());
}
CUmodule module;
CUresult result = cuModuleLoad(&module, outputFile.c_str());
if (result != CUDA_SUCCESS) {
std::stringstream m;
......@@ -441,7 +474,8 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
throw OpenMMException(m.str());
}
remove(inputFile.c_str());
remove(outputFile.c_str());
if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0)
remove(outputFile.c_str());
remove(logFile.c_str());
return module;
}
......@@ -616,15 +650,6 @@ void CudaContext::clearAutoclearBuffers() {
}
}
void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule[atom] = molecule;
for (int i = 0; i < (int) atomBonds[atom].size(); i++)
if (atomMolecule[atomBonds[atom][i]] == -1)
tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
}
/**
* This class ensures that atom reordering doesn't break virtual sites.
*/
......@@ -719,16 +744,14 @@ void CudaContext::findMoleculeGroups() {
}
}
// Now tag atoms by which molecule they belong to.
// Now identify atoms by which molecule they belong to.
vector<int> atomMolecule(numAtoms, -1);
int numMolecules = 0;
for (int i = 0; i < numAtoms; i++)
if (atomMolecule[i] == -1)
tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
vector<vector<int> > atomIndices(numMolecules);
for (int i = 0; i < numAtoms; i++)
atomIndices[atomMolecule[i]].push_back(i);
vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
int numMolecules = atomIndices.size();
vector<int> atomMolecule(numAtoms);
for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++)
atomMolecule[atomIndices[i][j]] = i;
// Construct a description of each molecule.
......
This diff is collapsed.
......@@ -2,11 +2,11 @@
* Apply the Andersen thermostat to adjust particle velocities.
*/
extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
unsigned int randomIndex, const int* __restrict__ atomGroups) {
float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
float randomRange = erff(collisionProbability/sqrtf(2.0f));
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index];
float4 selectRand = random[randomIndex+atomGroups[index]];
float4 velRand = random[randomIndex+index];
......
......@@ -2,16 +2,16 @@
* Perform the first step of Brownian integration.
*/
extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
const mixed fscale = tauDeltaT/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed invMass = velm[index].w;
if (invMass != 0) {
posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
posDelta[index].z = fscale*invMass*force[index+PADDED_NUM_ATOMS*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
}
randomIndex += blockDim.x*gridDim.x;
}
......@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm
* Perform the second step of Brownian integration.
*/
extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
const mixed oneOverDeltaT = RECIP(deltaT);
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
if (velm[index].w != 0) {
mixed4 delta = posDelta[index];
velm[index].x = oneOverDeltaT*delta.x;
......
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
extern "C" __global__ void applyPositionDeltas(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
......
......@@ -22,8 +22,11 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += -prefactor*(1.0f-erfcAlphaR);
if (1-erfcAlphaR > 1e-6) {
real erfAlphaR = ERF(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += -prefactor*erfAlphaR;
}
}
else {
#if HAS_LENNARD_JONES
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real padding;
#endif
} AtomData;
extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
real energy = 0.0f;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
for (int tile = startTile; tile < endTile; tile++) {
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
const int atom2 = atomData.y;
const int rangeStart = atomData.z&0xFFFF;
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
real3 force = make_real3(0);
real4 posq2 = posq[atom2];
localData[threadIdx.x].x = posq2.x;
localData[threadIdx.x].y = posq2.y;
localData[threadIdx.x].z = posq2.z;
localData[threadIdx.x].q = posq2.w;
LOAD_LOCAL_PARAMETERS
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
int tj = tgx;
for (int j = rangeStart; j < rangeEnd; j++) {
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (!isExcluded && r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
real dEdR = 0.0f;
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[localIndex].fx += delta.x;
localData[localIndex].fy += delta.y;
localData[localIndex].fz += delta.z;
#ifdef USE_CUTOFF
}
#endif
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
}
if (exclusions != 0) {
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
}
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
\ No newline at end of file
......@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration.
*/
extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
......@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index;
while (index < NUM_ATOMS) {
while (index < numAtoms) {
mixed4 velocity = velm[index];
if (velocity.w != 0) {
mixed sqrtInvMass = SQRT(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+paddedNumAtoms] + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+paddedNumAtoms*2] + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity;
posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
}
......@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
* Perform the second step of Langevin integration.
*/
extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
#if __CUDA_ARCH__ >= 130
double invStepSize = 1.0/dt[0].y;
#else
......@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif
int index = blockIdx.x*blockDim.x+threadIdx.x;
while (index < NUM_ATOMS) {
while (index < numAtoms) {
mixed4 vel = velm[index];
if (vel.w != 0) {
#ifdef USE_MIXED_PRECISION
......@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
* Select the step size to use for the next step.
*/
extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
// Calculate the error.
......@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
mixed err = 0;
unsigned int index = threadIdx.x;
const mixed scale = RECIP((mixed) 0x100000000);
while (index < NUM_ATOMS) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
while (index < numAtoms) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += blockDim.x*gridDim.x;
......@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
// Select the new step size.
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
......
......@@ -2,13 +2,13 @@
* Perform the first step of Verlet integration.
*/
extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq,
extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = dtVel/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION
......@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
real4 pos = posq[index];
#endif
velocity.x += scale*force[index]*velocity.w;
velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
pos.x = velocity.x*dtPos;
pos.y = velocity.y*dtPos;
pos.z = velocity.z*dtPos;
......@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
* Perform the second step of Verlet integration.
*/
extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
mixed2 stepSize = dt[0];
#if __CUDA_ARCH__ >= 130
......@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0)
dt[0].x = stepSize.y;
for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
for (; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION
......@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
* Select the step size to use for the next step.
*/
extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
// Calculate the error.
extern __shared__ mixed error[];
mixed err = 0.0f;
const mixed scale = RECIP((mixed) 0x100000000);
for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
}
......@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
__syncthreads();
}
if (threadIdx.x == 0) {
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
......
......@@ -34,6 +34,9 @@
* This tests all the different force terms in the CUDA implementation of CustomNonbondedForce.
*/
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include "openmm/internal/AssertionUtilities.h"
#include "sfmt/SFMT.h"
#include "openmm/Context.h"
......@@ -42,6 +45,7 @@
#include "openmm/NonbondedForce.h"
#include "openmm/System.h"
#include "openmm/VerletIntegrator.h"
#include <cmath>
#include <iostream>
#include <vector>
......@@ -538,6 +542,179 @@ void testLongRangeCorrection() {
ASSERT_EQUAL_TOL(standardEnergy1-standardEnergy2, customEnergy1-customEnergy2, 1e-4);
}
void testInteractionGroups() {
const int numParticles = 6;
System system;
VerletIntegrator integrator(0.01);
CustomNonbondedForce* nonbonded = new CustomNonbondedForce("v1+v2");
nonbonded->addPerParticleParameter("v");
vector<double> params(1, 0.001);
for (int i = 0; i < numParticles; i++) {
system.addParticle(1.0);
nonbonded->addParticle(params);
params[0] *= 10;
}
set<int> set1, set2, set3, set4;
set1.insert(2);
set2.insert(0);
set2.insert(1);
set2.insert(2);
set2.insert(3);
set2.insert(4);
set2.insert(5);
nonbonded->addInteractionGroup(set1, set2); // Particle 2 interacts with every other particle.
set3.insert(0);
set3.insert(1);
set4.insert(4);
set4.insert(5);
nonbonded->addInteractionGroup(set3, set4); // Particles 0 and 1 interact with 4 and 5.
nonbonded->addExclusion(1, 2); // Add an exclusion to make sure it gets skipped.
system.addForce(nonbonded);
Context context(system, integrator, platform);
vector<Vec3> positions(numParticles);
context.setPositions(positions);
State state = context.getState(State::Energy);
double expectedEnergy = 331.423; // Each digit is the number of interactions a particle particle is involved in.
ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), TOL);
}
void testLargeInteractionGroup() {
const int numMolecules = 300;
const int numParticles = numMolecules*2;
const double boxSize = 20.0;
// Create a large system.
System system;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
CustomNonbondedForce* nonbonded = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6)+138.935456*q/r; q=q1*q2; sigma=0.5*(sigma1+sigma2); eps=sqrt(eps1*eps2)");
nonbonded->addPerParticleParameter("q");
nonbonded->addPerParticleParameter("sigma");
nonbonded->addPerParticleParameter("eps");
vector<Vec3> positions(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<double> params(3);
for (int i = 0; i < numMolecules; i++) {
if (i < numMolecules/2) {
params[0] = 1.0;
params[1] = 0.2;
params[2] = 0.1;
nonbonded->addParticle(params);
params[0] = -1.0;
params[1] = 0.1;
nonbonded->addParticle(params);
}
else {
params[0] = 1.0;
params[1] = 0.2;
params[2] = 0.2;
nonbonded->addParticle(params);
params[0] = -1.0;
params[1] = 0.1;
nonbonded->addParticle(params);
}
positions[2*i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
positions[2*i+1] = Vec3(positions[2*i][0]+1.0, positions[2*i][1], positions[2*i][2]);
nonbonded->addExclusion(2*i, 2*i+1);
}
nonbonded->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
system.addForce(nonbonded);
// Compute the forces.
VerletIntegrator integrator(0.01);
Context context(system, integrator, platform);
context.setPositions(positions);
State state1 = context.getState(State::Forces);
// Modify the force so only one particle interacts with everything else.
set<int> set1, set2;
set1.insert(151);
for (int i = 0; i < numParticles; i++)
set2.insert(i);
nonbonded->addInteractionGroup(set1, set2);
context.reinitialize();
context.setPositions(positions);
State state2 = context.getState(State::Forces);
// The force on that one particle should be the same.
ASSERT_EQUAL_VEC(state1.getForces()[151], state2.getForces()[151], 1e-4);
// Modify the interaction group so it includes all interactions. This should now reproduce the original forces
// on all atoms.
for (int i = 0; i < numParticles; i++)
set1.insert(i);
nonbonded->setInteractionGroupParameters(0, set1, set2);
context.reinitialize();
context.setPositions(positions);
State state3 = context.getState(State::Forces);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(state1.getForces()[i], state3.getForces()[i], 1e-4);
}
void testInteractionGroupLongRangeCorrection() {
const int numParticles = 10;
const double boxSize = 10.0;
const double cutoff = 0.5;
System system;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
CustomNonbondedForce* nonbonded = new CustomNonbondedForce("c1*c2*r^-4");
nonbonded->addPerParticleParameter("c");
vector<Vec3> positions(numParticles);
vector<double> params(1);
for (int i = 0; i < numParticles; i++) {
system.addParticle(1.0);
params[0] = (i%2 == 0 ? 1.1 : 2.0);
nonbonded->addParticle(params);
positions[i] = Vec3(0.5*i, 0, 0);
}
nonbonded->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
nonbonded->setCutoffDistance(cutoff);
system.addForce(nonbonded);
// Setup nonbonded groups. They involve 1 interaction of type AA,
// 2 of type BB, and 5 of type AB.
set<int> set1, set2, set3, set4, set5;
set1.insert(0);
set1.insert(1);
set1.insert(2);
nonbonded->addInteractionGroup(set1, set1);
set2.insert(3);
set3.insert(4);
set3.insert(6);
set3.insert(8);
nonbonded->addInteractionGroup(set2, set3);
set4.insert(5);
set5.insert(7);
set5.insert(9);
nonbonded->addInteractionGroup(set4, set5);
// Compute energy with and without the correction.
VerletIntegrator integrator(0.01);
Context context(system, integrator, platform);
context.setPositions(positions);
double energy1 = context.getState(State::Energy).getPotentialEnergy();
nonbonded->setUseLongRangeCorrection(true);
context.reinitialize();
context.setPositions(positions);
double energy2 = context.getState(State::Energy).getPotentialEnergy();
// Check the result.
double sum = (1.1*1.1 + 2*2.0*2.0 + 5*1.1*2.0)*2.0;
int numPairs = (numParticles*(numParticles+1))/2;
double expected = 2*M_PI*numParticles*numParticles*sum/(numPairs*boxSize*boxSize*boxSize);
ASSERT_EQUAL_TOL(expected, energy2-energy1, 1e-4);
}
int main(int argc, char* argv[]) {
try {
if (argc > 1)
......@@ -553,6 +730,9 @@ int main(int argc, char* argv[]) {
testParallelComputation();
testSwitchingFunction();
testLongRangeCorrection();
testInteractionGroups();
testLargeInteractionGroup();
testInteractionGroupLongRangeCorrection();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
......
......@@ -53,44 +53,6 @@ using namespace std;
CudaPlatform platform;
void testChangingBoxSize() {
System system;
system.setDefaultPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 5, 0), Vec3(0, 0, 6));
system.addParticle(1.0);
NonbondedForce* nb = new NonbondedForce();
nb->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
nb->setCutoffDistance(2.0);
nb->addParticle(1, 0.5, 0.5);
system.addForce(nb);
LangevinIntegrator integrator(300.0, 1.0, 0.01);
Context context(system, integrator, platform);
vector<Vec3> positions;
positions.push_back(Vec3());
context.setPositions(positions);
Vec3 x, y, z;
context.getState(State::Forces).getPeriodicBoxVectors(x, y, z);
ASSERT_EQUAL_VEC(Vec3(4, 0, 0), x, 0);
ASSERT_EQUAL_VEC(Vec3(0, 5, 0), y, 0);
ASSERT_EQUAL_VEC(Vec3(0, 0, 6), z, 0);
context.setPeriodicBoxVectors(Vec3(7, 0, 0), Vec3(0, 8, 0), Vec3(0, 0, 9));
context.getState(State::Forces).getPeriodicBoxVectors(x, y, z);
ASSERT_EQUAL_VEC(Vec3(7, 0, 0), x, 0);
ASSERT_EQUAL_VEC(Vec3(0, 8, 0), y, 0);
ASSERT_EQUAL_VEC(Vec3(0, 0, 9), z, 0);
// Shrinking the box too small should produce an exception.
context.setPeriodicBoxVectors(Vec3(7, 0, 0), Vec3(0, 3.9, 0), Vec3(0, 0, 9));
bool ok = true;
try {
context.getState(State::Forces).getPeriodicBoxVectors(x, y, z);
ok = false;
}
catch (exception& ex) {
}
ASSERT(ok);
}
void testIdealGas() {
const int numParticles = 64;
const int frequency = 10;
......@@ -112,7 +74,7 @@ void testIdealGas() {
system.addParticle(1.0);
positions[i] = Vec3(initialLength*genrand_real2(sfmt), 0.5*initialLength*genrand_real2(sfmt), 2*initialLength*genrand_real2(sfmt));
}
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], frequency);
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], true, true, true, frequency);
system.addForce(barostat);
// Test it for three different temperatures.
......@@ -170,7 +132,7 @@ void testIdealGasAxis(int axis) {
system.addParticle(1.0);
positions[i] = Vec3(initialLength*genrand_real2(sfmt), 0.5*initialLength*genrand_real2(sfmt), 2*initialLength*genrand_real2(sfmt));
}
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], frequency, scaleX, scaleY, scaleZ);
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], scaleX, scaleY, scaleZ, frequency);
system.addForce(barostat);
// Test it for three different temperatures.
......@@ -226,7 +188,7 @@ void testRandomSeed() {
forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
}
system.addForce(forceField);
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp, 1);
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp, true, true, true, 1);
system.addForce(barostat);
vector<Vec3> positions(numParticles);
vector<Vec3> velocities(numParticles);
......@@ -332,7 +294,7 @@ void testEinsteinCrystal() {
system.addForce(force);
system.addForce(nb);
// Create the barostat.
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pres3[p], pres3[p], pres3[p]), temp, frequency, (a==0||a==3), (a==1||a==3), (a==2||a==3));
MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pres3[p], pres3[p], pres3[p]), temp, (a==0||a==3), (a==1||a==3), (a==2||a==3), frequency);
system.addForce(barostat);
barostat->setTemperature(temp);
LangevinIntegrator integrator(temp, 0.1, 0.01);
......@@ -422,7 +384,6 @@ int main(int argc, char* argv[]) {
try {
if (argc > 1)
platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
testChangingBoxSize();
testIdealGas();
testIdealGasAxis(0);
testIdealGasAxis(1);
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2011 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -191,6 +191,12 @@ public:
int getDeviceIndex() {
return deviceIndex;
}
/**
* Get the index of the cl::Platform associated with this object.
*/
int getPlatformIndex() {
return platformIndex;
}
/**
* Get the PlatformData object this context is part of.
*/
......@@ -589,7 +595,6 @@ private:
struct MoleculeGroup;
class VirtualSiteInfo;
void findMoleculeGroups();
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
......@@ -605,6 +610,7 @@ private:
double time;
OpenCLPlatform::PlatformData& platformData;
int deviceIndex;
int platformIndex;
int contextIndex;
int stepCount;
int computeForceCount;
......
......@@ -639,7 +639,7 @@ private:
class OpenCLCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
public:
OpenCLCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcCustomNonbondedForceKernel(name, platform),
cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), forceCopy(NULL), system(system) {
cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), interactionGroupData(NULL), forceCopy(NULL), system(system), hasInitializedKernel(false) {
}
~OpenCLCalcCustomNonbondedForceKernel();
/**
......@@ -666,15 +666,20 @@ public:
*/
void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
private:
void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
OpenCLContext& cl;
OpenCLParameterSet* params;
OpenCLArray* globals;
OpenCLArray* tabulatedFunctionParams;
OpenCLArray* interactionGroupData;
cl::Kernel interactionGroupKernel;
std::vector<void*> interactionGroupArgs;
std::vector<std::string> globalParamNames;
std::vector<cl_float> globalParamValues;
std::vector<OpenCLArray*> tabulatedFunctions;
double longRangeCoefficient;
bool hasInitializedLongRangeCorrection;
bool hasInitializedLongRangeCorrection, hasInitializedKernel;
int numGroupThreadBlocks;
CustomNonbondedForce* forceCopy;
const System& system;
};
......
......@@ -81,6 +81,13 @@ public:
*/
template <class T>
void setParameterValues(const std::vector<std::vector<T> >& values);
/**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
*/
std::vector<OpenCLNonbondedUtilities::ParameterInfo>& getBuffers() {
return buffers;
}
/**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
......
......@@ -19,6 +19,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${OPENCL_LIBRARIES} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_OPENCL_BUILDING_SHARED_LIBRARY")
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-msse2 -DOPENMM_OPENCL_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -39,6 +39,7 @@
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "openmm/internal/ContextImpl.h"
#include <algorithm>
#include <fstream>
#include <iostream>
......@@ -87,17 +88,25 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
contextIndex = platformData.contexts.size();
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platformIndex < 0 || platformIndex >= (int) platforms.size())
throw OpenMMException("Illegal value for OpenCL platform index");
string platformVendor = platforms[platformIndex].getInfo<CL_PLATFORM_VENDOR>();
vector<cl::Device> devices;
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
const int minThreadBlockSize = 32;
if (deviceIndex < 0 || deviceIndex >= (int) devices.size()) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
int bestSpeed = -1;
int bestDevice = -1;
int bestPlatform = -1;
for (int j = 0; j < platforms.size(); j++) {
// if they supplied a valid platformIndex, we only look through that platform
if (j != platformIndex && platformIndex >= 0 && platformIndex < (int) platforms.size())
continue;
string platformVendor = platforms[j].getInfo<CL_PLATFORM_VENDOR>();
vector<cl::Device> devices;
platforms[j].getDevices(CL_DEVICE_TYPE_ALL, &devices);
for (int i = 0; i < (int) devices.size(); i++) {
// if they supplied a valid deviceIndex, we only look through that one
if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
continue;
if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
continue; // Don't use AMD GPUs on OS X due to serious bugs.
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
......@@ -136,15 +145,26 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
deviceIndex = i;
bestDevice = i;
bestSpeed = speed;
bestPlatform = j;
}
}
}
if (deviceIndex == -1)
if (bestPlatform == -1)
throw OpenMMException("No compatible OpenCL platform is available");
if (bestDevice == -1)
throw OpenMMException("No compatible OpenCL device is available");
device = devices[deviceIndex];
this->deviceIndex = deviceIndex;
vector<cl::Device> devices;
platforms[bestPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
string platformVendor = platforms[bestPlatform].getInfo<CL_PLATFORM_VENDOR>();
device = devices[bestDevice];
this->deviceIndex = bestDevice;
this->platformIndex = bestPlatform;
if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
......@@ -226,7 +246,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
vector<cl::Device> contextDevices;
contextDevices.push_back(device);
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[platformIndex](), 0};
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
context = cl::Context(contextDevices, cprops, errorCallback);
queue = cl::CommandQueue(context, device);
numAtoms = system.getNumParticles();
......@@ -618,15 +638,6 @@ void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
executeKernel(reduceReal4Kernel, bufferSize, 128);
}
void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule[atom] = molecule;
for (int i = 0; i < (int) atomBonds[atom].size(); i++)
if (atomMolecule[atomBonds[atom][i]] == -1)
tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
}
/**
* This class ensures that atom reordering doesn't break virtual sites.
*/
......@@ -722,16 +733,14 @@ void OpenCLContext::findMoleculeGroups() {
}
}
// Now tag atoms by which molecule they belong to.
// Now identify atoms by which molecule they belong to.
vector<int> atomMolecule(numAtoms, -1);
int numMolecules = 0;
for (int i = 0; i < numAtoms; i++)
if (atomMolecule[i] == -1)
tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
vector<vector<int> > atomIndices(numMolecules);
for (int i = 0; i < numAtoms; i++)
atomIndices[atomMolecule[i]].push_back(i);
vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
int numMolecules = atomIndices.size();
vector<int> atomMolecule(numAtoms);
for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++)
atomMolecule[atomIndices[i][j]] = i;
// Construct a description of each molecule.
......
......@@ -46,6 +46,7 @@
#include "lepton/ParsedExpression.h"
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMUtilities.h"
#include <algorithm>
#include <cmath>
#include <set>
......@@ -1875,6 +1876,17 @@ void OpenCLCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& contex
class OpenCLCustomNonbondedForceInfo : public OpenCLForceInfo {
public:
OpenCLCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : OpenCLForceInfo(requiredBuffers), force(force) {
if (force.getNumInteractionGroups() > 0) {
groupsForParticle.resize(force.getNumParticles());
for (int i = 0; i < force.getNumInteractionGroups(); i++) {
set<int> set1, set2;
force.getInteractionGroupParameters(i, set1, set2);
for (set<int>::const_iterator iter = set1.begin(); iter != set1.end(); ++iter)
groupsForParticle[*iter].insert(2*i);
for (set<int>::const_iterator iter = set2.begin(); iter != set2.end(); ++iter)
groupsForParticle[*iter].insert(2*i+1);
}
}
}
bool areParticlesIdentical(int particle1, int particle2) {
vector<double> params1;
......@@ -1884,6 +1896,8 @@ public:
for (int i = 0; i < (int) params1.size(); i++)
if (params1[i] != params2[i])
return false;
if (groupsForParticle.size() > 0 && groupsForParticle[particle1] != groupsForParticle[particle2])
return false;
return true;
}
int getNumParticleGroups() {
......@@ -1901,6 +1915,7 @@ public:
}
private:
const CustomNonbondedForce& force;
vector<set<int> > groupsForParticle;
};
OpenCLCalcCustomNonbondedForceKernel::~OpenCLCalcCustomNonbondedForceKernel() {
......@@ -1910,6 +1925,8 @@ OpenCLCalcCustomNonbondedForceKernel::~OpenCLCalcCustomNonbondedForceKernel() {
delete globals;
if (tabulatedFunctionParams != NULL)
delete tabulatedFunctionParams;
if (interactionGroupData != NULL)
delete interactionGroupData;
for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
delete tabulatedFunctions[i];
if (forceCopy != NULL)
......@@ -1920,7 +1937,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
int forceIndex;
for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
;
string prefix = "custom"+cl.intToString(forceIndex)+"_";
string prefix = (force.getNumInteractionGroups() == 0 ? "custom"+cl.intToString(forceIndex)+"_" : "");
// Record parameters and exclusions.
......@@ -2021,14 +2038,18 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
replacements["SWITCH_C5"] = cl.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
}
string source = cl.replaceStrings(OpenCLKernelSources::customNonbonded, replacements);
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+cl.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
}
if (globals != NULL) {
globals->upload(globalParamValues);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
if (force.getNumInteractionGroups() > 0)
initInteractionGroups(force, source);
else {
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+cl.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
}
if (globals != NULL) {
globals->upload(globalParamValues);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
}
}
cl.addForce(new OpenCLCustomNonbondedForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
......@@ -2044,6 +2065,250 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
}
}
void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbondedForce& force, const string& interactionSource) {
// Process groups to form tiles.
vector<vector<int> > atomLists;
vector<pair<int, int> > tiles;
map<pair<int, int>, int> duplicateInteractions;
for (int group = 0; group < force.getNumInteractionGroups(); group++) {
// Get the list of atoms in this group and sort them.
set<int> set1, set2;
force.getInteractionGroupParameters(group, set1, set2);
vector<int> atoms1, atoms2;
atoms1.insert(atoms1.begin(), set1.begin(), set1.end());
atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
sort(atoms1.begin(), atoms1.end());
sort(atoms2.begin(), atoms2.end());
// Find how many tiles we will create for this group.
int tileWidth = min(min(32, (int) atoms1.size()), (int) atoms2.size());
int numBlocks1 = (atoms1.size()+tileWidth-1)/tileWidth;
int numBlocks2 = (atoms2.size()+tileWidth-1)/tileWidth;
// Add the tiles.
for (int i = 0; i < numBlocks1; i++)
for (int j = 0; j < numBlocks2; j++)
tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
// Add the atom lists.
for (int i = 0; i < numBlocks1; i++) {
vector<int> atoms;
int first = i*tileWidth;
int last = min((i+1)*tileWidth, (int) atoms1.size());
for (int j = first; j < last; j++)
atoms.push_back(atoms1[j]);
atomLists.push_back(atoms);
}
for (int i = 0; i < numBlocks2; i++) {
vector<int> atoms;
int first = i*tileWidth;
int last = min((i+1)*tileWidth, (int) atoms2.size());
for (int j = first; j < last; j++)
atoms.push_back(atoms2[j]);
atomLists.push_back(atoms);
}
// If this group contains duplicate interactions, record that we need to skip them once.
for (int i = 0; i < (int) atoms1.size(); i++) {
int a1 = atoms1[i];
if (set2.find(a1) == set2.end())
continue;
for (int j = 0; j < (int) atoms2.size() && atoms2[j] < a1; j++) {
int a2 = atoms2[j];
if (set1.find(a2) != set1.end()) {
pair<int, int> key = make_pair(a2, a1);
if (duplicateInteractions.find(key) == duplicateInteractions.end())
duplicateInteractions[key] = 0;
duplicateInteractions[key]++;
}
}
}
}
// Build a lookup table for quickly identifying excluded interactions.
set<pair<int, int> > exclusions;
for (int i = 0; i < force.getNumExclusions(); i++) {
int p1, p2;
force.getExclusionParticles(i, p1, p2);
exclusions.insert(make_pair(min(p1, p2), max(p1, p2)));
}
// Build the exclusion flags for each tile. While we're at it, filter out tiles
// where all interactions are excluded, and sort the tiles by size.
vector<vector<int> > exclusionFlags(tiles.size());
vector<pair<int, int> > tileOrder;
for (int tile = 0; tile < tiles.size(); tile++) {
if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
// For efficiency, we want the first axis to be the larger one.
int swap = tiles[tile].first;
tiles[tile].first = tiles[tile].second;
tiles[tile].second = swap;
}
vector<int>& atoms1 = atomLists[tiles[tile].first];
vector<int>& atoms2 = atomLists[tiles[tile].second];
vector<int> flags(atoms1.size(), (int) (1LL<<atoms2.size())-1);
int numExcluded = 0;
for (int i = 0; i < (int) atoms1.size(); i++)
for (int j = 0; j < (int) atoms2.size(); j++) {
int a1 = atoms1[i];
int a2 = atoms2[j];
bool isExcluded = false;
pair<int, int> key = make_pair(min(a1, a2), max(a1, a2));
if (a1 == a2 || exclusions.find(key) != exclusions.end())
isExcluded = true; // This is an excluded interaction.
else if (duplicateInteractions.find(key) != duplicateInteractions.end() && duplicateInteractions[key] > 0) {
// Both atoms are in both sets, so skip duplicate interactions.
isExcluded = true;
duplicateInteractions[key]--;
}
if (isExcluded) {
flags[i] &= -1-(1<<j);
numExcluded++;
}
}
if (numExcluded == atoms1.size()*atoms2.size())
continue; // All interactions are excluded.
tileOrder.push_back(make_pair((int) -atoms2.size(), tile));
exclusionFlags[tile] = flags;
}
sort(tileOrder.begin(), tileOrder.end());
// Merge tiles to get as close as possible to 32 along the first axis of each one.
vector<int> tileSetStart;
tileSetStart.push_back(0);
int tileSetSize = 0;
for (int i = 0; i < tileOrder.size(); i++) {
int tile = tileOrder[i].second;
int size = atomLists[tiles[tile].first].size();
if (tileSetSize+size > 32) {
tileSetStart.push_back(i);
tileSetSize = 0;
}
tileSetSize += size;
}
tileSetStart.push_back(tileOrder.size());
// Build the data structures.
int numTileSets = tileSetStart.size()-1;
vector<mm_int4> groupData;
for (int tileSet = 0; tileSet < numTileSets; tileSet++) {
int indexInTileSet = 0;
int minSize = 0;
if (cl.getSIMDWidth() < 32) {
// We need to include a barrier inside the inner loop, so ensure that all
// threads will loop the same number of times.
for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++)
minSize = max(minSize, (int) atomLists[tiles[tileOrder[i].second].first].size());
}
for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) {
int tile = tileOrder[i].second;
vector<int>& atoms1 = atomLists[tiles[tile].first];
vector<int>& atoms2 = atomLists[tiles[tile].second];
int range = indexInTileSet + ((indexInTileSet+max(minSize, (int) atoms1.size()))<<16);
int allFlags = (1<<atoms2.size())-1;
for (int j = 0; j < (int) atoms1.size(); j++) {
int a1 = atoms1[j];
int a2 = (j < atoms2.size() ? atoms2[j] : 0);
int flags = (exclusionFlags[tile].size() > 0 ? exclusionFlags[tile][j] : allFlags);
groupData.push_back(mm_int4(a1, a2, range, flags<<indexInTileSet));
}
indexInTileSet += atoms1.size();
}
for (; indexInTileSet < 32; indexInTileSet++)
groupData.push_back(mm_int4(0, 0, minSize<<16, 0));
}
interactionGroupData = OpenCLArray::create<mm_int4>(cl, groupData.size(), "interactionGroupData");
interactionGroupData->upload(groupData);
// Create the kernel.
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = interactionSource;
const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData;
int localDataSize = 0;
vector<OpenCLNonbondedUtilities::ParameterInfo>& buffers = params->getBuffers();
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getNumComponents() == 1)
localData<<buffers[i].getComponentType()<<" params"<<(i+1)<<";\n";
else {
for (int j = 0; j < buffers[i].getNumComponents(); ++j)
localData<<buffers[i].getComponentType()<<" params"<<(i+1)<<"_"<<suffixes[j]<<";\n";
}
localDataSize += buffers[i].getSize();
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args;
for (int i = 0; i < (int) buffers.size(); i++)
args<<", __global const "<<buffers[i].getType()<<"* restrict global_params"<<(i+1);
if (globals != NULL)
args<<", __global const float* restrict globals";
replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream load1;
for (int i = 0; i < (int) buffers.size(); i++)
load1<<buffers[i].getType()<<" params"<<(i+1)<<"1 = global_params"<<(i+1)<<"[atom1];\n";
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
stringstream loadLocal2;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getNumComponents() == 1)
loadLocal2<<"localData[get_local_id(0)].params"<<(i+1)<<" = global_params"<<(i+1)<<"[atom2];\n";
else {
loadLocal2<<buffers[i].getType()<<" temp_params"<<(i+1)<<" = global_params"<<(i+1)<<"[atom2];\n";
for (int j = 0; j < buffers[i].getNumComponents(); ++j)
loadLocal2<<"localData[get_local_id(0)].params"<<(i+1)<<"_"<<suffixes[j]<<" = temp_params"<<(i+1)<<"."<<suffixes[j]<<";\n";
}
}
replacements["LOAD_LOCAL_PARAMETERS"] = loadLocal2.str();
stringstream load2;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getNumComponents() == 1)
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n";
else {
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = ("<<buffers[i].getType()<<") (";
for (int j = 0; j < buffers[i].getNumComponents(); ++j) {
if (j > 0)
load2<<", ";
load2<<"localData[localIndex].params"<<(i+1)<<"_"<<suffixes[j];
}
load2<<");\n";
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
map<string, string> defines;
if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
defines["USE_CUTOFF"] = "1";
if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
defines["USE_PERIODIC"] = "1";
defines["THREAD_BLOCK_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
double cutoff = force.getCutoffDistance();
defines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
defines["TILE_SIZE"] = "32";
int numContexts = cl.getPlatformData().contexts.size();
int startIndex = cl.getContextIndex()*numTileSets/numContexts;
int endIndex = (cl.getContextIndex()+1)*numTileSets/numContexts;
defines["FIRST_TILE"] = cl.intToString(startIndex);
defines["LAST_TILE"] = cl.intToString(endIndex);
if ((localDataSize/4)%2 == 0 && !cl.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customNonbondedGroups, replacements), defines);
interactionGroupKernel = cl::Kernel(program, "computeInteractionGroups");
numGroupThreadBlocks = cl.getNonbondedUtilities().getNumForceThreadBlocks();
}
double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
if (globals != NULL) {
bool changed = false;
......@@ -2065,6 +2330,25 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
longRangeCoefficient = CustomNonbondedForceImpl::calcLongRangeCorrection(*forceCopy, context.getOwner());
hasInitializedLongRangeCorrection = true;
}
if (interactionGroupData != NULL) {
if (!hasInitializedKernel) {
hasInitializedKernel = true;
int index = 0;
bool useLong = cl.getSupports64BitGlobalAtomics();
interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer());
setPeriodicBoxSizeArg(cl, interactionGroupKernel, index++);
setInvPeriodicBoxSizeArg(cl, interactionGroupKernel, index++);
for (int i = 0; i < (int) params->getBuffers().size(); i++)
interactionGroupKernel.setArg<cl::Memory>(index++, params->getBuffers()[i].getMemory());
if (globals != NULL)
interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
}
int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
return longRangeCoefficient/(boxSize.x*boxSize.y*boxSize.z);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment