Commit 1010df33 authored by Peter Eastman's avatar Peter Eastman
Browse files

Checking in Cuda implementation of explicit solvent

parent df4b64cb
......@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
IF (APPLE)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/mac)
ELSE (APPLE)
IF (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win)
INSTALL_FILES(/lib FILES ${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win/cudpp32.dll)
ELSE (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/linux)
ENDIF (WIN32)
ENDIF(APPLE)
SUBDIRS (sharedTarget staticTarget)
......@@ -39,42 +39,40 @@
using namespace OpenMM;
StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const {
if (name == "particlePositions") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
if (name == "particlePositions") {
float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu);
}
if (name == "particleVelocities") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu);
}
if (name == "particleForces") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu);
}
switch (type) {
case Stream::Float:
case Stream::Double:
return new CudaStreamImpl<float1>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<float1>(name, size, type, platform, 1, data.gpu);
case Stream::Float2:
case Stream::Double2:
return new CudaStreamImpl<float2>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<float2>(name, size, type, platform, 1, data.gpu);
case Stream::Float3:
case Stream::Double3:
return new CudaStreamImpl<float3>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<float3>(name, size, type, platform, 1, data.gpu);
case Stream::Float4:
case Stream::Double4:
return new CudaStreamImpl<float4>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<float4>(name, size, type, platform, 1, data.gpu);
case Stream::Integer:
return new CudaStreamImpl<int1>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<int1>(name, size, type, platform, 1, data.gpu);
case Stream::Integer2:
return new CudaStreamImpl<int2>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<int2>(name, size, type, platform, 1, data.gpu);
case Stream::Integer3:
return new CudaStreamImpl<int3>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<int3>(name, size, type, platform, 1, data.gpu);
case Stream::Integer4:
return new CudaStreamImpl<int4>(name, size, type, platform, 1, NULL);
return new CudaStreamImpl<int4>(name, size, type, platform, 1, data.gpu);
}
throw OpenMMException("Tried to create a Stream with an illegal DataType.");
}
......@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template <class T>
void CudaStreamImpl<T>::loadFromArray(const void* array) {
float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) {
float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = arrayData[i*width+j];
data[i*rowOffset+j] = arrayData[order[i]*width+j];
}
else if (baseType == Stream::Double) {
double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j];
data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
}
else {
int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j];
data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
}
for (int i = getSize(); i < (int) stream->_length; ++i)
for (int j = 0; j < rowOffset; ++j)
......@@ -167,23 +168,24 @@ template <class T>
void CudaStreamImpl<T>::saveToArray(void* array) {
stream->Download();
float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) {
float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
arrayData[i*width+j] = data[i*rowOffset+j];
arrayData[order[i]*width+j] = data[i*rowOffset+j];
}
else if (baseType == Stream::Double) {
double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
arrayData[i*width+j] = data[i*rowOffset+j];
arrayData[order[i]*width+j] = data[i*rowOffset+j];
}
else {
int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
arrayData[i*width+j] = (int) data[i*rowOffset+j];
arrayData[order[i]*width+j] = (int) data[i*rowOffset+j];
}
}
......
......@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJForces_12(gpuContext gpu);
extern void kCalculateObcGbsaForces1(gpuContext gpu);
extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu);
extern void kApplyFirstSettle(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu);
extern void kApplySecondSettle(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu);
......@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
......@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu);
......
......@@ -36,11 +36,12 @@
#include <limits>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <vector_functions.h>
using namespace std;
#define RTERROR(status, s) \
if (status != cudaSuccess) { \
......@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
enum CudaNonbondedMethod
{
NO_CUTOFF,
CUTOFF,
PERIODIC
};
struct cudaGmxSimulation {
// Constants
......@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
unsigned int threads_per_block; // Threads per block to launch
unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls
unsigned int bornForce2_threads_per_block; // Threads per block in nonbond kernel calls
......@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls
unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls
unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call
unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
unsigned int interaction_threads_per_block; // Threads per block when identifying interacting tiles
unsigned int workUnits; // Number of work units
unsigned int* pWorkUnit; // Pointer to work units
unsigned int* pInteractingWorkUnit; // Pointer to work units that have interactions
unsigned int* pInteractionFlag; // Flags for which work units have interactions
size_t* pInteractionCount; // A count of the number of work units which have interactions
unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation
unsigned int workUnitsPerSM; // Number of workblocks per SM
......@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned int outputBuffers; // Number of output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
float nonbondedCutoffSqr; // Cutoff distance for CDLJ calculations
float periodicBoxSizeX; // The X dimension of the periodic box
float periodicBoxSizeY; // The Y dimension of the periodic box
float periodicBoxSizeZ; // The Z dimension of the periodic box
float reactionFieldK; // Constant for reaction field correction
float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant
......@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints
unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system
......@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data
unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
int* pAtomIndex; // The original index of each atom
float4* pGridBoundingBox; // The size of each grid cell
float4* pGridCenter; // The center of each grid cell
// Mutable stuff
float4* pPosq; // Pointer to atom positions and charges
......
......@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
......@@ -40,6 +41,7 @@
#include <ctime>
#include <cmath>
#include <map>
#include <algorithm>
#ifdef WIN32
#include <windows.h>
#else
......@@ -148,6 +150,23 @@ struct ShakeCluster {
}
};
struct Constraint
{
Constraint(int atom1, int atom2, float distance2) : atom1(atom1), atom2(atom2), distance2(distance2) {
}
int atom1, atom2;
float distance2;
};
struct Molecule {
vector<int> atoms;
vector<int> bonds;
vector<int> angles;
vector<int> periodicTorsions;
vector<int> rbTorsions;
vector<int> constraints;
};
static const float dielectricOffset = 0.009f;
static const float PI = 3.1415926535f;
static const float probeRadius = 0.14f;
......@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
}
}
cout << total_exclusions << " total exclusions.\n";
gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions, NO_CUTOFF);
gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
return coulombs;
}
......@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
const vector<char>& symbol, const vector<vector<int> >& exclusions)
const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
{
unsigned int coulombs = atom.size();
gpu->sim.epsfac = epsfac;
unsigned int total_exclusions = 0;
gpu->sim.nonbondedMethod = method;
gpu->exclusions = exclusions;
for (unsigned int i = 0; i < coulombs; i++)
{
......@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu->psPosq4->_pSysStream[0][i].w = p0;
gpu->psSigEps2->_pSysStream[0][i].x = p1;
gpu->psSigEps2->_pSysStream[0][i].y = p2;
#if (DUMP_PARAMETERS == 1)
cout <<
i << " " <<
gpu->psPosq4->_pSysStream[0][i].w << " " <<
gpu->psSigEps2->_pSysStream[0][i].x << " " <<
gpu->psSigEps2->_pSysStream[0][i].y << " " <<
p0 << " " <<
p1 << " " <<
p2 << " " <<
exclusions;
#endif
for (int j = 0; j < (int) exclusions[i].size(); j++)
{
#if (DUMP_PARAMETERS == 1)
cout << " " << exclusions[i][j];
#endif
gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
if (i >= (int) exclusions[i][j])
{
total_exclusions++;
}
}
#if (DUMP_PARAMETERS == 1)
cout << endl;
#endif
}
// Dummy out extra atom data
......@@ -868,29 +860,23 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu->psSigEps2->_pSysStream[0][i].y = 0.0f;
}
// Add in remaining exclusions
for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
{
for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
{
gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
}
}
gpu->psPosq4->Upload();
gpu->psSigEps2->Upload();
}
// Check for exclusion consistency
for (unsigned int i = 0; i < coulombs; i++)
{
for (unsigned int j = i; j < coulombs; j++)
{
extern "C"
void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
{
gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
}
if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
}
}
extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
{
gpu->sim.periodicBoxSizeX = xsize;
gpu->sim.periodicBoxSizeY = ysize;
gpu->sim.periodicBoxSizeZ = zsize;
}
extern "C"
......@@ -898,6 +884,7 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
{
unsigned int atoms = atom.size();
gpu->bIncludeGBSA = true;
for (unsigned int i = 0; i < atoms; i++)
{
gpu->psObcData->_pSysStream[0][i].x = radius[i] - dielectricOffset;
......@@ -974,10 +961,90 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
constraintCount[atom2[i]]++;
}
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
vector<map<int, float> > settleConstraints(gpu->natoms);
for (int i = 0; i < atom1.size(); i++) {
if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
settleConstraints[atom1[i]][atom2[i]] = distance[i];
settleConstraints[atom2[i]][atom1[i]] = distance[i];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
vector<int> settleClusters;
for (int i = 0; i < settleConstraints.size(); i++) {
if (settleConstraints[i].size() == 2) {
int partner1 = settleConstraints[i].begin()->first;
int partner2 = (++settleConstraints[i].begin())->first;
if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
settleConstraints[i].clear();
else if (i < partner1 && i < partner2)
settleClusters.push_back(i);
}
else
settleConstraints[i].clear();
}
// Record the actual SETTLE clusters.
CUDAStream<int4>* psSettleID = new CUDAStream<int4>((int) settleClusters.size(), 1);
gpu->psSettleID = psSettleID;
gpu->sim.pSettleID = psSettleID->_pDevStream[0];
CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1);
gpu->psSettleParameter = psSettleParameter;
gpu->sim.pSettleParameter = psSettleParameter->_pDevStream[0];
gpu->sim.settleConstraints = settleClusters.size();
for (int i = 0; i < settleClusters.size(); i++) {
int atom1 = settleClusters[i];
int atom2 = settleConstraints[atom1].begin()->first;
int atom3 = (++settleConstraints[atom1].begin())->first;
float dist12 = settleConstraints[atom1].find(atom2)->second;
float dist13 = settleConstraints[atom1].find(atom3)->second;
float dist23 = settleConstraints[atom2].find(atom3)->second;
if (dist12 == dist13) { // atom1 is the central atom
psSettleID->_pSysData[i].x = atom1;
psSettleID->_pSysData[i].y = atom2;
psSettleID->_pSysData[i].z = atom3;
psSettleParameter->_pSysData[i].x = dist12;
psSettleParameter->_pSysData[i].y = dist23;
}
else if (dist12 == dist23) { // atom2 is the central atom
psSettleID->_pSysData[i].x = atom2;
psSettleID->_pSysData[i].y = atom1;
psSettleID->_pSysData[i].z = atom3;
psSettleParameter->_pSysData[i].x = dist12;
psSettleParameter->_pSysData[i].y = dist13;
}
else if (dist13 == dist23) { // atom3 is the central atom
psSettleID->_pSysData[i].x = atom3;
psSettleID->_pSysData[i].y = atom1;
psSettleID->_pSysData[i].z = atom2;
psSettleParameter->_pSysData[i].x = dist13;
psSettleParameter->_pSysData[i].y = dist12;
}
else
throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
}
psSettleID->Upload();
psSettleParameter->Upload();
gpu->sim.settle_threads_per_block = (gpu->sim.settleConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.settle_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.settle_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.settle_threads_per_block < 1)
gpu->sim.settle_threads_per_block = 1;
// Find clusters consisting of a central atom with up to three peripheral atoms.
map<int, ShakeCluster> clusters;
for (int i = 0; i < atom1.size(); i++) {
if (settleConstraints[atom1[i]].size() == 2)
continue; // This is being taken care of with SETTLE.
// Determine which is the central atom.
bool firstIsCentral;
......@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
gpu->gpAtomTable = NULL;
gpu->gAtomTypes = 0;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.stride = gpu->psPosq4->_stride;
gpu->sim.stride2 = gpu->sim.stride * 2;
......@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
gpu->pAtomSymbol = new unsigned char[gpu->natoms];
gpu->psAtomIndex = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pAtomIndex = gpu->psAtomIndex->_pDevStream[0];
for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
gpu->psAtomIndex->_pSysStream[0][i] = i;
gpu->psAtomIndex->Upload();
// Determine randoms
gpu->seed = (unsigned long)time(NULL) & 0x000fffff;
gpu->sim.randomFrames = 995;
gpu->sim.randomFrames = 95;
gpu->sim.randomIterations = gpu->sim.randomFrames;
gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
......@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms)
{
sscanf(pAdapter, "%d", &device);
}
cudaError_t status = cudaSetDevice(device);
RTERROR(status, "Error setting CUDA device")
// cudaError_t status = cudaSetDevice(device);
// RTERROR(status, "Error setting CUDA device")
// Determine which core to run on
#if 0
......@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms)
gpu->sim.probeRadius = probeRadius;
gpu->sim.surfaceAreaFactor = surfaceAreaFactor;
gpu->sim.electricConstant = electricConstant;
gpu->sim.nonbondedMethod = NO_CUTOFF;
gpu->sim.nonbondedCutoffSqr = 0.0f;
gpu->sim.bigFloat = 99999999.0f;
gpu->sim.forceConversionFactor = forceConversionFactor;
......@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms)
gpu->bCalculateCM = false;
gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true;
gpu->bIncludeGBSA = false;
gpuInitializeRandoms(gpu);
// To be determined later
......@@ -1489,19 +1560,21 @@ void* gpuInit(int numAtoms)
gpu->psLJ14Parameter = NULL;
gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL;
gpu->psSettleParameter = NULL;
gpu->psExclusion = NULL;
gpu->psWorkUnit = NULL;
gpu->psInteractingWorkUnit = NULL;
gpu->psInteractionFlag = NULL;
gpu->psInteractionCount = NULL;
gpu->psGridBoundingBox = NULL;
gpu->psGridCenter = NULL;
// Initialize output buffer before reading parameters
gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms];
memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
// Initialize exclusion array
gpu->pExclusion = new unsigned int[gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms];
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms; i++)
gpu->pExclusion[i] = 1;
return (void*)gpu;
}
......@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu)
{
// Delete sysmem pointers
delete[] gpu->pOutputBufferCounter;
delete[] gpu->pExclusion;
delete[] gpu->gpAtomTable;
delete[] gpu->pAtomSymbol;
......@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psLJ14Parameter;
delete gpu->psShakeID;
delete gpu->psShakeParameter;
delete gpu->psSettleID;
delete gpu->psSettleParameter;
delete gpu->psExclusion;
delete gpu->psWorkUnit;
delete gpu->psInteractingWorkUnit;
delete gpu->psInteractionFlag;
delete gpu->psInteractionCount;
delete gpu->psRandom4;
delete gpu->psRandom2;
delete gpu->psRandomPosition;
delete gpu->psRandomSeed;
delete gpu->psLinearMomentum;
delete gpu->psAtomIndex;
delete gpu->psGridBoundingBox;
delete gpu->psGridCenter;
if (gpu->cudpp != 0)
cudppDestroyPlan(gpu->cudpp);
// Wrap up
delete gpu;
......@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu)
extern "C"
int gpuBuildOutputBuffers(gpuContext gpu)
{
// Select the number of output buffer to use.
gpu->bOutputBufferPerWarp = true;
gpu->sim.nonbondOutputBuffers = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block / GRID;
if (gpu->sim.nonbondOutputBuffers >= gpu->sim.paddedNumberOfAtoms/GRID)
{
// For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
gpu->bOutputBufferPerWarp = false;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
}
gpu->sim.totalNonbondOutputBuffers = (gpu->bIncludeGBSA ? 2 * gpu->sim.nonbondOutputBuffers : gpu->sim.nonbondOutputBuffers);
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers;
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++)
{
......@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
const unsigned int grid = gpu->grid;
const unsigned int dim = (atoms + (grid - 1)) / grid;
const unsigned int cells = dim * (dim + 1) / 2;
const unsigned int* pExclusion = gpu->pExclusion;
CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
gpu->psWorkUnit = psWorkUnit;
gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
gpu->psInteractingWorkUnit = psInteractingWorkUnit;
gpu->sim.pInteractingWorkUnit = psInteractingWorkUnit->_pDevStream[0];
CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u);
gpu->psInteractionFlag = psInteractionFlag;
gpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u);
gpu->psInteractionCount = psInteractionCount;
gpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u);
gpu->psGridBoundingBox = psGridBoundingBox;
gpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u);
gpu->psGridCenter = psGridCenter;
gpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID;
gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID;
gpu->sim.workUnits = cells;
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
{
gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
}
if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
// Initialize the CUDPP workspace.
gpu->cudpp = 0;
CUDPPConfiguration config;
config.datatype = CUDPP_UINT;
config.algorithm = CUDPP_COMPACT;
config.options = CUDPP_OPTION_FORWARD;
CUDPPResult result = cudppPlan(&gpu->cudpp, config, cells, 1, 0);
if (CUDPP_SUCCESS != result)
{
gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
printf("Error initializing CUDPP: %d\n", result);
exit(-1);
}
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
// int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
// if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
// {
// gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
// }
// if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
// {
// gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
// }
gpu->sim.nbWorkUnitsPerBlock = cells / gpu->sim.nonbond_blocks;
gpu->sim.nbWorkUnitsPerBlockRemainder = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
gpu->sim.bf2WorkUnitsPerBlock = cells / gpu->sim.bornForce2_blocks;
gpu->sim.bf2WorkUnitsPerBlockRemainder = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;
gpu->sim.interaction_threads_per_block = 64;
gpu->sim.interaction_blocks = (gpu->sim.workUnits + gpu->sim.interaction_threads_per_block - 1) / gpu->sim.interaction_threads_per_block;
// Decrease thread count for extra small molecules to spread computation
// across entire chip
......@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
for (unsigned int x = y; x < dim; x++)
{
pWorkList[count] = (x << 17) | (y << 2);
// Check for exclusions
int exclusions = 0;
for (unsigned int i = y * grid; i < y * grid + grid; i++)
{
for (unsigned int j = x * grid; j < x * grid + grid; j++)
{
if (!pExclusion[i * atoms + j])
{
exclusions++;
}
}
}
// Signal exclusions if they exist
if (exclusions > 0)
pWorkList[count] |= 0x1;
count++;
}
}
......@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
}
extern "C"
int gpuBuildExclusionList(gpuContext gpu)
void gpuBuildExclusionList(gpuContext gpu)
{
unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
const unsigned int grid = gpu->grid;
const unsigned int dim = (atoms+(grid-1))/grid;
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>((atoms*atoms+grid-1) / grid, 1u);
gpu->psExclusion = psExclusion;
gpu->sim.pExclusion = psExclusion->_pDevStream[0];
unsigned int* pExList = psExclusion->_pSysStream[0];
int exclusions = 0;
unsigned int pos = 0;
unsigned int* pWorkList = gpu->psWorkUnit->_pSysStream[0];
for (int i = 0; i < psExclusion->_length; ++i)
pExList[i] = 0xFFFFFFFF;
for (unsigned int x = 0; x < atoms; x += gpu->grid)
{
for (unsigned int y = 0; y < atoms; y += gpu->grid)
{
for (unsigned x1 = x; x1 < x + gpu->grid; x1++)
{
unsigned int mask = 0;
for (unsigned int y1 = y ; y1 < y + gpu->grid; y1++)
// Fill in the exclusions.
for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
{
mask >>= 1;
if (gpu->pExclusion[x1 * atoms + y1] == 0)
int x = atom1/grid;
int offset = atom1-x*grid;
for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
{
if (x1 >= y1)
exclusions++;
}
else
mask |= 0x80000000;
int atom2 = gpu->exclusions[atom1][j];
int y = atom2/grid;
int index = x*atoms+y*grid+offset;
pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
pWorkList[cell] |= 1;
}
pExList[pos++] = mask;
}
// Mark all interactions that involve a padding atom as being excluded.
for (int atom1 = gpu->natoms; atom1 < atoms; ++atom1)
{
int x = atom1/grid;
int offset1 = atom1-x*grid;
for (int atom2 = 0; atom2 < atoms; ++atom2)
{
int y = atom2/grid;
int index = x*atoms+y*grid+offset1;
pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
int offset2 = atom2-y*grid;
index = y*atoms+x*grid+offset2;
pExList[index] &= 0xFFFFFFFF-(1<<(atom1-x*grid));
int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
pWorkList[cell] |= 1;
}
}
psExclusion->Upload();
gpu->psWorkUnit->Upload();
gpuSetConstants(gpu);
return exclusions;
}
extern "C"
......@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu)
SetUpdateShakeHSim(gpu);
SetVerletUpdateSim(gpu);
SetBrownianUpdateSim(gpu);
SetSettleSim(gpu);
SetRandomSim(gpu);
if (gpu->sm_version >= SM_12)
{
SetCalculateCDLJForces_12Sim(gpu);
SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
SetCalculateObcGbsaForces1_12Sim(gpu);
SetCalculateObcGbsaForces2_12Sim(gpu);
}
return 1;
......@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu)
);
}
}
static void tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds)
{
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule[atom] = molecule;
for (int i = 0; i < atomBonds[atom].size(); i++)
if (atomMolecule[atomBonds[atom][i]] == -1)
tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
}
static void findMoleculeGroups(gpuContext gpu)
{
// First make a list of constraints for future use.
vector<Constraint> constraints;
for (int i = 0; i < gpu->sim.ShakeConstraints; i++)
{
int atom1 = gpu->psShakeID->_pSysData[i].x;
int atom2 = gpu->psShakeID->_pSysData[i].y;
int atom3 = gpu->psShakeID->_pSysData[i].z;
int atom4 = gpu->psShakeID->_pSysData[i].w;
float distance2 = gpu->psShakeParameter->_pSysData[i].z;
constraints.push_back(Constraint(atom1, atom2, distance2));
if (atom3 != -1)
constraints.push_back(Constraint(atom1, atom3, distance2));
if (atom4 != -1)
constraints.push_back(Constraint(atom1, atom3, distance2));
}
for (int i = 0; i < gpu->sim.settleConstraints; i++)
{
int atom1 = gpu->psSettleID->_pSysData[i].x;
int atom2 = gpu->psSettleID->_pSysData[i].y;
int atom3 = gpu->psSettleID->_pSysData[i].z;
float distance12 = gpu->psSettleParameter->_pSysData[i].x;
float distance23 = gpu->psSettleParameter->_pSysData[i].y;
constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
}
// First make a list of every other atom to which each atom is connect by a bond or constraint.
int numAtoms = gpu->natoms;
vector<vector<int> > atomBonds(numAtoms);
for (int i = 0; i < gpu->sim.bonds; i++)
{
int atom1 = gpu->psBondID->_pSysData[i].x;
int atom2 = gpu->psBondID->_pSysData[i].y;
atomBonds[atom1].push_back(atom2);
atomBonds[atom2].push_back(atom1);
}
for (int i = 0; i < constraints.size(); i++)
{
int atom1 = constraints[i].atom1;
int atom2 = constraints[i].atom2;
atomBonds[atom1].push_back(atom2);
atomBonds[atom2].push_back(atom1);
}
// Now tag atoms by which molecule they belong to.
vector<int> atomMolecule(numAtoms, -1);
int numMolecules = 0;
for (int i = 0; i < numAtoms; i++)
if (atomMolecule[i] == -1)
tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
vector<vector<int> > atomIndices(numMolecules);
for (int i = 0; i < numAtoms; i++)
atomIndices[atomMolecule[i]].push_back(i);
// Construct a description of each molecule.
vector<Molecule> molecules(numMolecules);
for (int i = 0; i < numMolecules; i++)
molecules[i].atoms = atomIndices[i];
for (int i = 0; i < gpu->sim.bonds; i++)
{
int atom1 = gpu->psBondID->_pSysData[i].x;
molecules[atomMolecule[atom1]].bonds.push_back(i);
}
for (int i = 0; i < gpu->sim.bond_angles; i++)
{
int atom1 = gpu->psBondAngleID1->_pSysData[i].x;
molecules[atomMolecule[atom1]].angles.push_back(i);
}
for (int i = 0; i < gpu->sim.dihedrals; i++)
{
int atom1 = gpu->psDihedralID1->_pSysData[i].x;
molecules[atomMolecule[atom1]].periodicTorsions.push_back(i);
}
for (int i = 0; i < gpu->sim.rb_dihedrals; i++)
{
int atom1 = gpu->psRbDihedralID1->_pSysData[i].x;
molecules[atomMolecule[atom1]].rbTorsions.push_back(i);
}
for (int i = 0; i < constraints.size(); i++)
{
molecules[atomMolecule[constraints[i].atom1]].constraints.push_back(i);
}
// Sort them into groups of identical molecules.
vector<Molecule> uniqueMolecules;
vector<vector<int> > moleculeInstances;
for (int molIndex = 0; molIndex < molecules.size(); molIndex++)
{
Molecule& mol = molecules[molIndex];
// See if it is identical to another molecule.
bool isNew = true;
for (int j = 0; j < uniqueMolecules.size() && isNew; j++)
{
Molecule& mol2 = uniqueMolecules[j];
bool identical = true;
if (mol.atoms.size() != mol2.atoms.size() || mol.bonds.size() != mol2.bonds.size()
|| mol.angles.size() != mol2.angles.size() || mol.periodicTorsions.size() != mol2.periodicTorsions.size()
|| mol.rbTorsions.size() != mol2.rbTorsions.size() || mol.constraints.size() != mol2.constraints.size())
identical = false;
int atomOffset = mol2.atoms[0]-mol.atoms[0];
float4* posq = gpu->psPosq4->_pSysData;
float4* velm = gpu->psVelm4->_pSysData;
float2* sigeps = gpu->psSigEps2->_pSysData;
for (int i = 0; i < mol.atoms.size() && identical; i++)
if (mol.atoms[i] != mol2.atoms[i]-atomOffset || posq[mol.atoms[i]].w != posq[mol2.atoms[i]].w ||
velm[mol.atoms[i]].w != velm[mol2.atoms[i]].w || sigeps[mol.atoms[i]].x != sigeps[mol2.atoms[i]].x ||
sigeps[mol.atoms[i]].y != sigeps[mol2.atoms[i]].y)
identical = false;
int4* bondID = gpu->psBondID->_pSysData;
float2* bondParam = gpu->psBondParameter->_pSysData;
for (int i = 0; i < mol.bonds.size() && identical; i++)
if (bondID[mol.bonds[i]].x != bondID[mol2.bonds[i]].x-atomOffset || bondID[mol.bonds[i]].y != bondID[mol2.bonds[i]].y-atomOffset ||
bondParam[mol.bonds[i]].x != bondParam[mol2.bonds[i]].x || bondParam[mol.bonds[i]].y != bondParam[mol2.bonds[i]].y)
identical = false;
int4* angleID = gpu->psBondAngleID1->_pSysData;
float2* angleParam = gpu->psBondAngleParameter->_pSysData;
for (int i = 0; i < mol.angles.size() && identical; i++)
if (angleID[mol.angles[i]].x != angleID[mol2.angles[i]].x-atomOffset ||
angleID[mol.angles[i]].y != angleID[mol2.angles[i]].y-atomOffset ||
angleID[mol.angles[i]].z != angleID[mol2.angles[i]].z-atomOffset ||
angleParam[mol.angles[i]].x != angleParam[mol2.angles[i]].x ||
angleParam[mol.angles[i]].y != angleParam[mol2.angles[i]].y)
identical = false;
int4* periodicID = gpu->psDihedralID1->_pSysData;
float4* periodicParam = gpu->psDihedralParameter->_pSysData;
for (int i = 0; i < mol.periodicTorsions.size() && identical; i++)
if (periodicID[mol.periodicTorsions[i]].x != periodicID[mol2.periodicTorsions[i]].x-atomOffset ||
periodicID[mol.periodicTorsions[i]].y != periodicID[mol2.periodicTorsions[i]].y-atomOffset ||
periodicID[mol.periodicTorsions[i]].z != periodicID[mol2.periodicTorsions[i]].z-atomOffset ||
periodicID[mol.periodicTorsions[i]].w != periodicID[mol2.periodicTorsions[i]].w-atomOffset ||
periodicParam[mol.periodicTorsions[i]].x != periodicParam[mol2.periodicTorsions[i]].x ||
periodicParam[mol.periodicTorsions[i]].y != periodicParam[mol2.periodicTorsions[i]].y ||
periodicParam[mol.periodicTorsions[i]].z != periodicParam[mol2.periodicTorsions[i]].z)
identical = false;
int4* rbID = gpu->psRbDihedralID1->_pSysData;
float4* rbParam1 = gpu->psRbDihedralParameter1->_pSysData;
float2* rbParam2 = gpu->psRbDihedralParameter2->_pSysData;
for (int i = 0; i < mol.rbTorsions.size() && identical; i++)
if (rbID[mol.rbTorsions[i]].x != rbID[mol2.rbTorsions[i]].x-atomOffset ||
rbID[mol.rbTorsions[i]].y != rbID[mol2.rbTorsions[i]].y-atomOffset ||
rbID[mol.rbTorsions[i]].z != rbID[mol2.rbTorsions[i]].z-atomOffset ||
rbID[mol.rbTorsions[i]].w != rbID[mol2.rbTorsions[i]].w-atomOffset ||
rbParam1[mol.rbTorsions[i]].x != rbParam1[mol2.rbTorsions[i]].x ||
rbParam1[mol.rbTorsions[i]].y != rbParam1[mol2.rbTorsions[i]].y ||
rbParam1[mol.rbTorsions[i]].z != rbParam1[mol2.rbTorsions[i]].z ||
rbParam1[mol.rbTorsions[i]].w != rbParam1[mol2.rbTorsions[i]].w ||
rbParam2[mol.rbTorsions[i]].x != rbParam2[mol2.rbTorsions[i]].x ||
rbParam2[mol.rbTorsions[i]].y != rbParam2[mol2.rbTorsions[i]].y)
identical = false;
for (int i = 0; i < mol.constraints.size() && identical; i++)
if (constraints[mol.constraints[i]].atom1 != constraints[mol2.constraints[i]].atom1-atomOffset ||
constraints[mol.constraints[i]].atom2 != constraints[mol2.constraints[i]].atom2-atomOffset ||
constraints[mol.constraints[i]].distance2 != constraints[mol2.constraints[i]].distance2)
identical = false;
if (identical)
{
moleculeInstances[j].push_back(mol.atoms[0]);
isNew = false;
}
}
if (isNew)
{
uniqueMolecules.push_back(mol);
moleculeInstances.push_back(vector<int>());
moleculeInstances[moleculeInstances.size()-1].push_back(mol.atoms[0]);
}
}
gpu->moleculeGroups.resize(moleculeInstances.size());
for (int i = 0; i < moleculeInstances.size(); i++)
{
gpu->moleculeGroups[i].instances = moleculeInstances[i];
vector<int>& atoms = uniqueMolecules[i].atoms;
gpu->moleculeGroups[i].atoms.resize(atoms.size());
for (int j = 0; j < atoms.size(); j++)
gpu->moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
}
}
extern "C"
void gpuReorderAtoms(gpuContext gpu)
{
if (gpu->natoms == 0 || gpu->sim.nonbondedCutoffSqr == 0.0)
return;
if (gpu->moleculeGroups.size() == 0)
findMoleculeGroups(gpu);
// Find the range of positions and the number of bins along each axis.
int numAtoms = gpu->natoms;
gpu->psPosq4->Download();
gpu->psVelm4->Download();
float4* posq = gpu->psPosq4->_pSysData;
float4* velm = gpu->psVelm4->_pSysData;
float minx = posq[0].x, maxx = posq[0].x;
float miny = posq[0].y, maxy = posq[0].y;
float minz = posq[0].z, maxz = posq[0].z;
if (gpu->sim.nonbondedMethod == PERIODIC)
{
minx = miny = minz = 0.0;
maxx = gpu->sim.periodicBoxSizeX;
maxy = gpu->sim.periodicBoxSizeY;
maxz = gpu->sim.periodicBoxSizeZ;
}
else
{
for (int i = 1; i < numAtoms; i++)
{
minx = min(minx, posq[i].x);
maxx = max(maxx, posq[i].x);
miny = min(miny, posq[i].y);
maxy = max(maxy, posq[i].y);
minz = min(minz, posq[i].z);
maxz = max(maxz, posq[i].z);
}
}
float binWidth = 0.2*sqrt(gpu->sim.nonbondedCutoffSqr);
int xbins = 1 + (int) ((maxx-minx)/binWidth);
int ybins = 1 + (int) ((maxy-miny)/binWidth);
int zbins = 1 + (int) ((maxz-minz)/binWidth);
// Loop over each group of identical molecules and reorder them.
vector<int> originalIndex(numAtoms);
vector<float4> newPosq(numAtoms);
vector<float4> newVelm(numAtoms);
for (int group = 0; group < gpu->moleculeGroups.size(); group++)
{
// Find the center of each molecule.
gpuMoleculeGroup& mol = gpu->moleculeGroups[group];
int numMolecules = mol.instances.size();
vector<int>& atoms = mol.atoms;
vector<float3> molPos(numMolecules);
for (int i = 0; i < numMolecules; i++)
{
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < atoms.size(); j++)
{
int atom = atoms[j]+mol.instances[i];
molPos[i].x += posq[atom].x;
molPos[i].y += posq[atom].y;
molPos[i].z += posq[atom].z;
}
molPos[i].x /= atoms.size();
molPos[i].y /= atoms.size();
molPos[i].z /= atoms.size();
}
if (gpu->sim.nonbondedMethod == PERIODIC)
{
// Move each molecule position into the same box.
for (int i = 0; i < numMolecules; i++)
{
molPos[i].x -= floor(molPos[i].x/gpu->sim.periodicBoxSizeX)*gpu->sim.periodicBoxSizeX;
molPos[i].y -= floor(molPos[i].y/gpu->sim.periodicBoxSizeY)*gpu->sim.periodicBoxSizeY;
molPos[i].z -= floor(molPos[i].z/gpu->sim.periodicBoxSizeZ)*gpu->sim.periodicBoxSizeZ;
}
}
// Select a bin for each molecule, then sort them by bin.
vector<pair<int, int> > molBins(numMolecules);
for (int i = 0; i < numMolecules; i++)
{
int x = (int) ((molPos[i].x-minx)/binWidth);
int y = (int) ((molPos[i].y-miny)/binWidth);
int z = (int) ((molPos[i].z-minz)/binWidth);
int yodd = y&1;
int zodd = z&1;
int bin = z*xbins*ybins;
bin += (zodd ? ybins-y : y)*xbins;
bin += (yodd ? xbins-x : x);
molBins[i] = pair<int, int>(bin, i);
}
sort(molBins.begin(), molBins.end());
// Reorder the atoms.
for (int i = 0; i < numMolecules; i++)
{
for (int j = 0; j < atoms.size(); j++)
{
int oldIndex = mol.instances[molBins[i].second]+atoms[j];
int newIndex = mol.instances[i]+atoms[j];
originalIndex[newIndex] = gpu->psAtomIndex->_pSysStream[0][oldIndex];
newPosq[newIndex] = posq[oldIndex];
newVelm[newIndex] = velm[oldIndex];
}
}
}
// Update the streams.
for (int i = 0; i < numAtoms; i++)
posq[i] = newPosq[i];
gpu->psPosq4->Upload();
for (int i = 0; i < numAtoms; i++)
velm[i] = newVelm[i];
gpu->psVelm4->Upload();
for (int i = 0; i < numAtoms; i++)
gpu->psAtomIndex->_pSysData[i] = originalIndex[i];
gpu->psAtomIndex->Upload();
}
......@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudpp.h"
#include <vector>
struct gpuAtomType {
string name;
std::string name;
char symbol;
float r;
};
struct gpuMoleculeGroup {
std::vector<int> atoms;
std::vector<int> instances;
};
enum SM_VERSION
{
SM_10,
......@@ -61,8 +67,9 @@ struct _gpuContext {
int gAtomTypes;
cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter;
unsigned int* pExclusion;
std::vector<std::vector<int> > exclusions;
unsigned char* pAtomSymbol;
std::vector<gpuMoleculeGroup> moleculeGroups;
float iterations;
float epsfac;
float solventDielectric;
......@@ -71,8 +78,11 @@ struct _gpuContext {
bool bCalculateCM;
bool bRemoveCM;
bool bRecalculateBornRadii;
bool bOutputBufferPerWarp;
bool bIncludeGBSA;
unsigned long seed;
SM_VERSION sm_version;
CUDPPHandle cudpp;
CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4;
......@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream<int>* psNonShakeID;
CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter;
CUDAStream<int4>* psSettleID;
CUDAStream<float2>* psSettleParameter;
CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<unsigned int>* psInteractingWorkUnit;
CUDAStream<unsigned int>* psInteractionFlag;
CUDAStream<size_t>* psInteractionCount;
CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration
CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration
CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed
CUDAStream<int>* psRandomPosition; // Pointer to random number positions
CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA
CUDAStream<int>* psAtomIndex; // The original index of each atom
CUDAStream<float4>* psGridBoundingBox; // The size of each grid cell
CUDAStream<float4>* psGridCenter; // The center and radius for each grid cell
};
typedef struct _gpuContext *gpuContext;
......@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s);
float gpuGetAtomicRadius(gpuContext gpu, std::string s);
extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s);
unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname);
......@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions);
const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
extern "C"
void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
......@@ -227,7 +249,7 @@ extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu);
extern "C"
int gpuBuildExclusionList(gpuContext gpu);
void gpuBuildExclusionList(gpuContext gpu);
extern "C"
int gpuSetConstants(gpuContext gpu);
......@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern "C"
void gpuDumpObcLoop1(gpuContext gpu);
extern "C"
void gpuReorderAtoms(gpuContext gpu);
#endif //__GPUTYPES_H__
......@@ -54,15 +54,8 @@ struct Atom {
float fx;
float fy;
float fz;
float eps2;
float sig2;
};
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJForcesSim(gpuContext gpu)
......@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateCDLJForces_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Include versions of the kernels for N^2 calculations.
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
while (pos >= 0)
{
// Include versions of the kernels with cutoffs.
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJForces.h"
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
// Include versions of the kernels with periodic boundary conditions.
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJForces.h"
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
__global__ extern void kCalculateCDLJCutoffForces_12_kernel();
for (unsigned int j = 0; j < GRID; j++)
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJCutoffForces\n");
CUDPPResult result;
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
LAUNCHERROR("kCalculateCDLJN2Forces");
break;
case CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
pos -= cSim.nonbond_workBlock;
}
}
__global__ extern void kCalculateCDLJForces_12_kernel();
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJForces\n");
if (gpu->sm_version < SM_12)
kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJForces");
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
}
}
\ No newline at end of file
......@@ -33,9 +33,6 @@
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
......@@ -54,15 +51,8 @@ struct Atom {
float fy;
float fz;
float fb;
float q2;
float junk;
};
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
......@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateCDLJObcGbsaForces1_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Include versions of the kernel for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
// printf("kCalculateCDLJObcGbsaForces1\n");
while (pos >= 0)
{
// check if Born radii need to be calculated
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
kClearBornForces(gpu);
CUDPPResult result;
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = cSim.epsfac * apos.w;
sA[threadIdx.x].q2 = cSim.preFactor * apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
case NO_CUTOFF:
if (gpu->bRecalculateBornRadii)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = cSim.epsfac * temp.w;
sA[threadIdx.x].q2 = cSim.preFactor * temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break;
case CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = cSim.epsfac * apos.w;
sA[threadIdx.x].q2 = cSim.preFactor * apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
gpu->psInteractionCount->Download();
if (gpu->bRecalculateBornRadii)
{
dEdR = 0.0f;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
excl = (excl >> tgx) | (excl << (GRID - tgx));
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = cSim.epsfac * temp.w;
sA[threadIdx.x].q2 = cSim.preFactor * temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
dEdR = 0.0f;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
pos -= cSim.nonbond_workBlock;
}
}
__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
//printf("In kCalculateCDLJObcGbsaForces1 QQQ\n");
// check if Born radii need to be calculated
if( gpu->bRecalculateBornRadii ){
gpu->psInteractionCount->Download();
if (gpu->bRecalculateBornRadii)
{
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
if (gpu->sm_version < SM_12)
kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
if( 0 ){
static int step = 0;
// int numPrint = -1;
step++;
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
//gpuDumpCoordinates( gpu );
kReduceBornSumAndForces( gpu );
gpuDumpObcLoop1( gpu );
}
LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
break;
}
}
......@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
pos += blockDim.x * gridDim.x;
}
if (cSim.nonbondedMethod == NO_CUTOFF)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
......@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
}
pos += blockDim.x * gridDim.x;
}
}
else if (cSim.nonbondedMethod == CUTOFF)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
else if (cSim.nonbondedMethod == PERIODIC)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
d.x -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
d.y -= floor(d.x/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
d.z -= floor(d.x/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
}
......
......@@ -53,10 +53,6 @@ struct Atom {
float junk;
};
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
......@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__ void kClearObcGbsaBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
__global__ void kReduceObcGbsaBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR("kReduceObcGbsaBornSum");
}
__global__ void kCalculateObcGbsaBornSum_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if ((j != tgx) && (ar.x < rScaledRadiusJ))
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
apos.w += l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[j].r - r))
{
apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
}
}
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sum = apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[tj].sr - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
if (psA[tj].r < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
}
psA[tj].sum += term;
}
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
pos -= cSim.nonbond_workBlock;
}
}
void kCalculateObcGbsaBornSum(gpuContext gpu)
{
// printf("kCalculateObcgbsaBornSum\n");
kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearBornSum");
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
break;
case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateBornSum");
}
......@@ -52,18 +52,9 @@ struct Atom {
float fy;
float fz;
float fb;
// float sum;
// float oneOverR;
int pos;
int wx;
int wy;
};
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
......@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateObcGbsaForces2_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (sA[threadIdx.x].pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
sA[threadIdx.x].wx = x;
sA[threadIdx.x].wy = y;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born Forces and sum
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rInverse = 1.0f / r;
float l_ij = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
// Include versions of the kernels for N^2 calculations.
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Atom J Born Forces and sum
float rScaledRadiusI = r + a.y;
l_ij = 1.0f / max(psA[tj].r, fabs(r - a.y));
u_ij = 1.0f / rScaledRadiusI;
l_ij2 = l_ij * l_ij;
u_ij2 = u_ij * u_ij;
t1 = log (u_ij / l_ij);
t2 = (l_ij2 - u_ij2);
t3 = t2 * rInverse;
t1 *= rInverse;
// Include versions of the kernels with cutoffs.
// Born Forces term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
dE = psA[tj].fb * term;
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
// Include versions of the kernels with periodic boundary conditions.
tj = sNext[tj];
}
// Write results
int offset = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
offset = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA[threadIdx.x].pos -= cSim.bornForce2_workBlock;
}
}
__global__ extern void kCalculateObcGbsaForces2_12_kernel();
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
void kCalculateObcGbsaForces2(gpuContext gpu)
{
//printf("kCalculateObcGbsaForces2\n");
if (gpu->sm_version < SM_12)
kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
if( 0 ){
static int step = 0;
//int numPrint = -1;
step++;
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
//gpuDumpCoordinates( gpu );
kReduceBornSumAndForces( gpu );
gpuDumpObcLoop1( gpu );
}
kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
break;
case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateObcGbsaForces2");
}
......@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
__global__ void kClearForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride4 * cSim.outputBuffers)
while (pos < cSim.stride * cSim.outputBuffers)
{
((float*)cSim.pForce4)[pos] = 0.0f;
cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x;
}
}
......
......@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__ void kVerletUpdatePart1_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms)
{
......@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__ void kVerletUpdatePart2_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms)
{
......@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
while (pos < cSim.atoms)
{
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters->setScaledRadiusFactors(scaleFactors);
obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for (int i = 0; i < system.getNumForces(); i++) {
const NonbondedForce* nonbonded = dynamic_cast<const NonbondedForce*>(&system.getForce(i));
if (nonbonded != NULL) {
if (nonbonded->getNonbondedMethod() != NonbondedForce::NoCutoff)
obcParameters->setUseCutoff(nonbonded->getCutoffDistance());
if (nonbonded->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
Vec3 boxVectors[3];
nonbonded->getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
RealOpenMM periodicBoxSize[3];
periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
obcParameters->setPeriodic(periodicBoxSize);
}
break;
}
}
obc = new CpuObc(obcParameters);
obc->setIncludeAceApproximation(true);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment