Commit 1010df33 authored by Peter Eastman
Browse files

Checking in Cuda implementation of explicit solvent

parent df4b64cb
......@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
# Select the directory holding the prebuilt cudpp library for the host platform.
# Each platform has its own subdirectory under ${CMAKE_CURRENT_SOURCE_DIR}/cudpp.
IF (APPLE)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/mac)
ELSE (APPLE)
IF (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win)
# On Windows the cudpp DLL is also installed into /lib alongside the built targets.
INSTALL_FILES(/lib FILES ${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win/cudpp32.dll)
ELSE (WIN32)
# Neither Apple nor Windows: fall back to the linux directory.
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/linux)
ENDIF (WIN32)
ENDIF(APPLE)
SUBDIRS (sharedTarget staticTarget)
......@@ -39,42 +39,40 @@
using namespace OpenMM;
/**
 * Create the CUDA implementation of a Stream.
 *
 * The three well-known streams ("particlePositions", "particleVelocities",
 * "particleForces") wrap the float4 streams already owned by the GPU context
 * (psPosq4, psVelm4, psForce4). Every other stream gets a freshly created
 * CudaStreamImpl whose element type is chosen from the requested DataType;
 * Double variants are stored with the same float vector types as the Float ones.
 *
 * Every stream is handed data.gpu (never NULL), because
 * CudaStreamImpl::loadFromArray/saveToArray read gpu->psAtomIndex to translate
 * between the caller's atom order and the order used on the device.
 *
 * @param name      the name of the stream to create
 * @param size      the number of elements in the stream
 * @param type      the data type of each element
 * @param platform  the Platform the stream belongs to
 * @param context   the context whose platform data supplies the gpuContext
 * @return a newly allocated StreamImpl; ownership passes to the caller
 * @throws OpenMMException if type is not a recognized Stream::DataType
 */
StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const {
    // Fetch the platform data once; every branch below needs data.gpu.
    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
    if (name == "particlePositions") {
        // NOTE(review): padding entries presumably get positions far from the real atoms - confirm.
        float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu);
    }
    if (name == "particleVelocities") {
        float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu);
    }
    if (name == "particleForces") {
        float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu);
    }
    switch (type) {
    case Stream::Float:
    case Stream::Double:
        return new CudaStreamImpl<float1>(name, size, type, platform, 1, data.gpu);
    case Stream::Float2:
    case Stream::Double2:
        return new CudaStreamImpl<float2>(name, size, type, platform, 1, data.gpu);
    case Stream::Float3:
    case Stream::Double3:
        return new CudaStreamImpl<float3>(name, size, type, platform, 1, data.gpu);
    case Stream::Float4:
    case Stream::Double4:
        return new CudaStreamImpl<float4>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer:
        return new CudaStreamImpl<int1>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer2:
        return new CudaStreamImpl<int2>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer3:
        return new CudaStreamImpl<int3>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer4:
        return new CudaStreamImpl<int4>(name, size, type, platform, 1, data.gpu);
    }
    throw OpenMMException("Tried to create a Stream with an illegal DataType.");
}
......@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template <class T>
void CudaStreamImpl<T>::loadFromArray(const void* array) {
float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) {
float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = arrayData[i*width+j];
data[i*rowOffset+j] = arrayData[order[i]*width+j];
}
else if (baseType == Stream::Double) {
double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j];
data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
}
else {
int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j];
data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
}
for (int i = getSize(); i < (int) stream->_length; ++i)
for (int j = 0; j < rowOffset; ++j)
......@@ -167,23 +168,24 @@ template <class T>
void CudaStreamImpl<T>::saveToArray(void* array) {
    // Copy the stream contents into a caller-supplied host array.
    //
    // The stream is downloaded, then each row i of the host-side buffer is
    // written to row order[i] of the output, where order is the atom index
    // table (psAtomIndex, "the original index of each atom"). This is the
    // inverse of the mapping applied by loadFromArray, so a load followed by a
    // save returns values in the caller's original atom order.
    //
    // array must point to getSize()*width elements of the stream's base type
    // (float, double, or int). Values are stored as floats in the stream, so
    // the integer path truncates on conversion.
    stream->Download();
    float* data = reinterpret_cast<float*>(stream->_pSysData);
    int* order = gpu->psAtomIndex->_pSysData;
    if (baseType == Stream::Float) {
        float* arrayData = (float*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
                arrayData[order[i]*width+j] = data[i*rowOffset+j];
    }
    else if (baseType == Stream::Double) {
        double* arrayData = (double*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
                arrayData[order[i]*width+j] = data[i*rowOffset+j];
    }
    else {
        int* arrayData = (int*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
                arrayData[order[i]*width+j] = (int) data[i*rowOffset+j];
    }
}
......
......@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJForces_12(gpuContext gpu);
extern void kCalculateObcGbsaForces1(gpuContext gpu);
extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu);
extern void kApplyFirstSettle(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu);
extern void kApplySecondSettle(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu);
......@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
......@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu);
......
......@@ -36,11 +36,12 @@
#include <limits>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <vector_functions.h>
using namespace std;
#define RTERROR(status, s) \
if (status != cudaSuccess) { \
......@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
// Selects how nonbonded interactions are evaluated. The value is stored in
// cudaGmxSimulation::nonbondedMethod and is switched on both by the host-side
// launcher kCalculateCDLJForces and inside kCalculateLocalForces_kernel.
enum CudaNonbondedMethod
{
// No distance cutoff is applied; the N2 kernel variants are launched.
NO_CUTOFF,
// Forces are zeroed for pairs beyond the cutoff; the Cutoff kernel variants are launched.
CUTOFF,
// As CUTOFF, but displacements are first wrapped by the periodic box size;
// the Periodic kernel variants are launched.
PERIODIC
};
struct cudaGmxSimulation {
// Constants
......@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
unsigned int threads_per_block; // Threads per block to launch
unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls
unsigned int bornForce2_threads_per_block; // Threads per block in nonbond kernel calls
......@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls
unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls
unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call
unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
unsigned int interaction_threads_per_block; // Threads per block when identifying interacting tiles
unsigned int workUnits; // Number of work units
unsigned int* pWorkUnit; // Pointer to work units
unsigned int* pInteractingWorkUnit; // Pointer to work units that have interactions
unsigned int* pInteractionFlag; // Flags for which work units have interactions
size_t* pInteractionCount; // A count of the number of work units which have interactions
unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation
unsigned int workUnitsPerSM; // Number of workblocks per SM
......@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned int outputBuffers; // Number of output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
float nonbondedCutoffSqr; // Cutoff distance for CDLJ calculations
float periodicBoxSizeX; // The X dimension of the periodic box
float periodicBoxSizeY; // The Y dimension of the periodic box
float periodicBoxSizeZ; // The Z dimension of the periodic box
float reactionFieldK; // Constant for reaction field correction
float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant
......@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints
unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system
......@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data
unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
int* pAtomIndex; // The original index of each atom
float4* pGridBoundingBox; // The size of each grid cell
float4* pGridCenter; // The center of each grid cell
// Mutable stuff
float4* pPosq; // Pointer to atom positions and charges
......
This diff is collapsed.
......@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudpp.h"
#include <vector>
// One entry of the atom type table.
// NOTE(review): field meanings are inferred from gpuGetAtomicRadius /
// gpuGetAtomicSymbol, which look entries up by a string key - confirm.
struct gpuAtomType {
    std::string name;   // key used to look the type up; qualified so the header does not rely on `using namespace std`
    char symbol;        // presumably the one-character symbol returned by gpuGetAtomicSymbol
    float r;            // presumably the radius returned by gpuGetAtomicRadius
};
// A group of molecules, stored in _gpuContext::moleculeGroups.
// NOTE(review): the semantics below are inferred from the field names only - confirm
// against the code that fills moleculeGroups.
struct gpuMoleculeGroup {
// presumably the atom indices making up one molecule of this kind
std::vector<int> atoms;
// presumably one entry per occurrence of that molecule in the system
std::vector<int> instances;
};
enum SM_VERSION
{
SM_10,
......@@ -61,8 +67,9 @@ struct _gpuContext {
int gAtomTypes;
cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter;
unsigned int* pExclusion;
std::vector<std::vector<int> > exclusions;
unsigned char* pAtomSymbol;
std::vector<gpuMoleculeGroup> moleculeGroups;
float iterations;
float epsfac;
float solventDielectric;
......@@ -71,8 +78,11 @@ struct _gpuContext {
bool bCalculateCM;
bool bRemoveCM;
bool bRecalculateBornRadii;
bool bOutputBufferPerWarp;
bool bIncludeGBSA;
unsigned long seed;
SM_VERSION sm_version;
CUDPPHandle cudpp;
CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4;
......@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream<int>* psNonShakeID;
CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter;
CUDAStream<int4>* psSettleID;
CUDAStream<float2>* psSettleParameter;
CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<unsigned int>* psInteractingWorkUnit;
CUDAStream<unsigned int>* psInteractionFlag;
CUDAStream<size_t>* psInteractionCount;
CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration
CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration
CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed
CUDAStream<int>* psRandomPosition; // Pointer to random number positions
CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA
CUDAStream<int>* psAtomIndex; // The original index of each atom
CUDAStream<float4>* psGridBoundingBox; // The size of each grid cell
CUDAStream<float4>* psGridCenter; // The center and radius for each grid cell
};
typedef struct _gpuContext *gpuContext;
......@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s);
float gpuGetAtomicRadius(gpuContext gpu, std::string s);
extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s);
unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname);
......@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions);
const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
extern "C"
void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
......@@ -227,7 +249,7 @@ extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu);
extern "C"
int gpuBuildExclusionList(gpuContext gpu);
void gpuBuildExclusionList(gpuContext gpu);
extern "C"
int gpuSetConstants(gpuContext gpu);
......@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern "C"
void gpuDumpObcLoop1(gpuContext gpu);
extern "C"
void gpuReorderAtoms(gpuContext gpu);
#endif //__GPUTYPES_H__
......@@ -54,15 +54,8 @@ struct Atom {
float fx;
float fy;
float fz;
float eps2;
float sig2;
};
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJForcesSim(gpuContext gpu)
......@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateCDLJForces_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Include versions of the kernels for N^2 calculations.
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
while (pos >= 0)
{
// Include versions of the kernels with cutoffs.
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJForces.h"
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
// Include versions of the kernels with periodic boundary conditions.
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJForces.h"
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
__global__ extern void kCalculateCDLJCutoffForces_12_kernel();
for (unsigned int j = 0; j < GRID; j++)
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJCutoffForces\n");
CUDPPResult result;
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
LAUNCHERROR("kCalculateCDLJN2Forces");
break;
case CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
pos -= cSim.nonbond_workBlock;
}
}
__global__ extern void kCalculateCDLJForces_12_kernel();
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJForces\n");
if (gpu->sm_version < SM_12)
kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJForces");
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
}
}
\ No newline at end of file
......@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
pos += blockDim.x * gridDim.x;
}
if (cSim.nonbondedMethod == NO_CUTOFF)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
......@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
}
pos += blockDim.x * gridDim.x;
}
}
// Lennard-Jones 1-4 interactions when a nonbonded cutoff is in use.
// Each thread strides over the 1-4 pair list; a pair beyond the cutoff
// contributes zero force.
else if (cSim.nonbondedMethod == CUTOFF)
{
while (pos < cSim.LJ14_offset)
{
// Index of this pair within the 1-4 list (entries start after the RB dihedrals).
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
// atom.x / atom.y index the two atoms; atom.z / atom.w select the
// output buffer each atom's force is written to.
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
// Displacement from atom 2 to atom 1.
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
// NOTE(review): LJ14.y appears to act as sigma, LJ14.x as the LJ prefactor
// and LJ14.z as the charge product - confirm against gpuSetLJ14Parameters.
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
// Electrostatic term including the reactionFieldK correction.
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
// Pairs farther apart than the cutoff exert no force.
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
// Accumulate onto the existing buffer contents only when the buffer index is
// below totalNonbondOutputBuffers; otherwise start from zero and overwrite.
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
// Equal and opposite forces on the two atoms.
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
// Advance by the total number of threads in the grid.
pos += blockDim.x * gridDim.x;
}
}
// Lennard-Jones 1-4 interactions under periodic boundary conditions.
// Identical to the CUTOFF branch except that the displacement between the two
// atoms is first wrapped to the nearest periodic image. Each component is
// wrapped using its own value and its own box dimension.
else if (cSim.nonbondedMethod == PERIODIC)
{
    while (pos < cSim.LJ14_offset)
    {
        // Index of this pair within the 1-4 list (entries start after the RB dihedrals).
        unsigned int pos1 = pos - cSim.rb_dihedral_offset;
        if (pos1 < cSim.LJ14s)
        {
            // atom.x / atom.y index the two atoms; atom.z / atom.w select the
            // output buffer each atom's force is written to.
            int4 atom           = cSim.pLJ14ID[pos1];
            float4 LJ14         = cSim.pLJ14Parameter[pos1];
            float4 a1           = cSim.pPosq[atom.x];
            float4 a2           = cSim.pPosq[atom.y];
            float3 d;
            d.x                 = a1.x - a2.x;
            d.y                 = a1.y - a2.y;
            d.z                 = a1.z - a2.z;

            // Minimum-image wrap, one axis at a time.
            d.x                -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
            d.y                -= floor(d.y/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
            d.z                -= floor(d.z/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

            float r2            = DOT3(d, d);
            float inverseR      = 1.0f / sqrt(r2);
            float sig2          = inverseR * LJ14.y;
            sig2               *= sig2;
            float sig6          = sig2 * sig2 * sig2;
            float dEdR          = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
            // Electrostatic term including the reactionFieldK correction.
            dEdR               += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
            dEdR               *= inverseR * inverseR;
            // Pairs farther apart than the cutoff exert no force.
            if (r2 > cSim.nonbondedCutoffSqr)
            {
                dEdR = 0.0f;
            }

            unsigned int offsetA = atom.x + atom.z * cSim.stride;
            unsigned int offsetB = atom.y + atom.w * cSim.stride;
            // Accumulate onto the existing buffer contents only when the buffer index is
            // below totalNonbondOutputBuffers; otherwise start from zero and overwrite.
            float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
            if (atom.z < cSim.totalNonbondOutputBuffers)
                forceA = cSim.pForce4[offsetA];
            float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
            if (atom.w < cSim.totalNonbondOutputBuffers)
                forceB = cSim.pForce4[offsetB];

            d.x                *= dEdR;
            d.y                *= dEdR;
            d.z                *= dEdR;
            // Equal and opposite forces on the two atoms.
            forceA.x           += d.x;
            forceA.y           += d.y;
            forceA.z           += d.z;
            forceB.x           -= d.x;
            forceB.y           -= d.y;
            forceB.z           -= d.z;
            cSim.pForce4[offsetA] = forceA;
            cSim.pForce4[offsetB] = forceB;
        }
        // Advance by the total number of threads in the grid.
        pos += blockDim.x * gridDim.x;
    }
}
}
......
......@@ -53,10 +53,6 @@ struct Atom {
float junk;
};
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
......@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// The kernel template header kCalculateObcGbsaBornSum.h is included six times below.
// Before each inclusion METHOD_NAME is redefined so that the two tokens passed to it are
// pasted around a different infix (N2, N2ByWarp, Cutoff, CutoffByWarp, Periodic,
// PeriodicByWarp), and the USE_* feature macros are toggled.
// NOTE(review): the header itself is not visible here; presumably it names its kernels via
// METHOD_NAME and branches on USE_CUTOFF / USE_PERIODIC / USE_OUTPUT_BUFFER_PER_WARP - confirm.

// Include versions of the kernels for N^2 calculations.

#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
// Same again with USE_OUTPUT_BUFFER_PER_WARP defined (the "ByWarp" variant).
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"

// Include versions of the kernels with cutoffs.

#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
// Cutoff variant with USE_OUTPUT_BUFFER_PER_WARP defined.
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"

// Include versions of the kernels with periodic boundary conditions.
// USE_CUTOFF is not #undef'd here, so it is still defined for both periodic inclusions.

#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
// Periodic variant with USE_OUTPUT_BUFFER_PER_WARP defined.
// METHOD_NAME, USE_CUTOFF, USE_PERIODIC and USE_OUTPUT_BUFFER_PER_WARP all remain defined
// after this final inclusion.
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Zeroes the first cSim.stride * cSim.nonbondOutputBuffers float entries of cSim.pBornSum.
// Each thread starts at its global thread index and advances by the total number of
// launched threads, so any 1-D launch configuration covers the whole range.
__global__ void kClearObcGbsaBornSum_kernel()
{
    float* bornSum = (float*) cSim.pBornSum;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride * cSim.nonbondOutputBuffers;
         idx += gridDim.x * blockDim.x)
    {
        bornSum[idx] = 0.0f;
    }
}
__global__ void kReduceObcGbsaBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR("kReduceObcGbsaBornSum");
}
// Computes per-atom OBC Born sums for the work units assigned to this block and stores
// them in cSim.pBornSum.
//
// Each block copies its slice of cSim.pWorkUnit into shared sWorkUnit, then every group of
// GRID consecutive threads walks that slice backwards in steps of cSim.nonbond_workBlock.
// A work unit packs two indices: bits 17 and up give x, bits 2..16 give y (both are then
// multiplied by GRID via << GRIDBITS). Thread tgx of a group owns atom x + tgx ("atom I")
// and stages one atom in shared memory for the rest of its group ("atom J").
//
// Relies on file-scope state defined elsewhere in this file: cSim, sA, sWorkUnit, sNext.
// NOTE(review): threads read sA entries written by other threads of the same GRID-sized
// group without any barrier in between; this presumably relies on a group never being
// wider than a warp and executing in lockstep - confirm for the targeted hardware.
__global__ void kCalculateObcGbsaBornSum_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Ring successor table: each lane steps to the previous lane, wrapping at 0.
        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;
    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        float dx;
        float dy;
        float dz;
        float r2;
        float r;
        unsigned int tgx = threadIdx.x & (GRID - 1);   // lane within the GRID-sized group
        unsigned int tbx = threadIdx.x - tgx;          // first thread of this group
        int tj = tgx;
        Atom* psA = &sA[tbx];                          // this group's slice of shared atoms
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
            sA[threadIdx.x].r = ar.x;
            sA[threadIdx.x].sr = ar.y;
            apos.w = 0.0f;                  // w is reused as the Born sum accumulator

            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[j].x - apos.x;
                dy = psA[j].y - apos.y;
                dz = psA[j].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r = sqrt(r2);
                // For j == tgx r is zero and rInverse is infinite; the j != tgx guard
                // below keeps that value from being used.
                float rInverse = 1.0f / r;
                float rScaledRadiusJ = r + psA[j].sr;
                if ((j != tgx) && (ar.x < rScaledRadiusJ))
                {
                    float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                    float u_ij = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    apos.w += l_ij -
                              u_ij +
                              0.25f * r * (u_ij2 - l_ij2) +
                              (0.50f * rInverse * ratio) +
                              (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                              (l_ij2 - u_ij2);
                    // Fix: compare against atom J's *scaled* radius (sr), as the
                    // off-diagonal branch below does; this previously used psA[j].r.
                    if (ar.x < (psA[j].sr - r))
                    {
                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                }
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
        }
        else // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            unsigned int i = x + tgx;
            float4 temp = cSim.pPosq[j];
            float2 temp1 = cSim.pObcData[j];
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].r = temp1.x;
            sA[threadIdx.x].sr = temp1.y;
            sA[threadIdx.x].sum = apos.w = 0.0f;

            // GRID steps; tj starts at this lane and follows sNext, so at each step every
            // lane of the group addresses a different shared atom.
            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[tj].x - apos.x;
                dy = psA[tj].y - apos.y;
                dz = psA[tj].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r = sqrt(r2);
                float rInverse = 1.0f / r;

                // Contribution of atom J to atom I's sum (kept in a register).
                float rScaledRadiusJ = r + psA[tj].sr;
                if (ar.x < rScaledRadiusJ)
                {
                    float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
                    float u_ij = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term = l_ij -
                                 u_ij +
                                 0.25f * r * (u_ij2 - l_ij2) +
                                 (0.50f * rInverse * ratio) +
                                 (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
                                 (l_ij2 - u_ij2);
                    if (ar.x < (psA[tj].sr - r))
                    {
                        term += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                    apos.w += term;
                }

                // Contribution of atom I to atom J's sum (accumulated in shared memory).
                float rScaledRadiusI = r + ar.y;
                if (psA[tj].r < rScaledRadiusI)
                {
                    float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
                    float u_ij = 1.0f / rScaledRadiusI;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term = l_ij -
                                 u_ij +
                                 0.25f * r * (u_ij2 - l_ij2) +
                                 (0.50f * rInverse * ratio) +
                                 (0.25f * ar.y * ar.y * rInverse) *
                                 (l_ij2 - u_ij2);
                    if (psA[tj].r < (ar.y - r))
                    {
                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
                    }
                    psA[tj].sum += term;
                }
                tj = sNext[tj];
            }

            // Write results
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
        }
        pos -= cSim.nonbond_workBlock;
    }
}
// Host-side launcher for the OBC Born sum calculation.
//
// First zeroes cSim.pBornSum with kClearObcGbsaBornSum_kernel, then launches the kernel
// variant selected by gpu->sim.nonbondedMethod (NO_CUTOFF, CUTOFF or PERIODIC) and by
// gpu->bOutputBufferPerWarp. Every variant is given sizeof(Atom) bytes of dynamic shared
// memory per thread. NO_CUTOFF passes the full work unit list; CUTOFF and PERIODIC pass
// gpu->sim.pInteractingWorkUnit together with the count read from
// gpu->psInteractionCount->_pSysData[0].
// NOTE(review): that count is read on the host without any copy in this function; it
// presumably must already have been downloaded by the caller - confirm.
//
// The legacy kCalculateObcGbsaBornSum_kernel launch that used to precede the clear has been
// dropped: it wrote pBornSum, which the clear kernel then zeroed straight away on the same
// stream, and its launch was never error-checked.
void kCalculateObcGbsaBornSum(gpuContext gpu)
{
    kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
    LAUNCHERROR("kClearBornSum");

    size_t numWithInteractions;
    switch (gpu->sim.nonbondedMethod)
    {
        case NO_CUTOFF:
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
            else
                kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
            break;
        case CUTOFF:
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
        case PERIODIC:
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
    }
    LAUNCHERROR("kCalculateBornSum");
}
......@@ -52,18 +52,9 @@ struct Atom {
float fy;
float fz;
float fb;
// float sum;
// float oneOverR;
int pos;
int wx;
int wy;
};
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
......@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateObcGbsaForces2_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (sA[threadIdx.x].pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
sA[threadIdx.x].wx = x;
sA[threadIdx.x].wy = y;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born Forces and sum
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rInverse = 1.0f / r;
float l_ij = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
// Include versions of the kernels for N^2 calculations.
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Atom J Born Forces and sum
float rScaledRadiusI = r + a.y;
l_ij = 1.0f / max(psA[tj].r, fabs(r - a.y));
u_ij = 1.0f / rScaledRadiusI;
l_ij2 = l_ij * l_ij;
u_ij2 = u_ij * u_ij;
t1 = log (u_ij / l_ij);
t2 = (l_ij2 - u_ij2);
t3 = t2 * rInverse;
t1 *= rInverse;
// Include versions of the kernels with cutoffs.
// Born Forces term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
dE = psA[tj].fb * term;
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
// Include versions of the kernels with periodic boundary conditions.
tj = sNext[tj];
}
// Write results
int offset = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
offset = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA[threadIdx.x].pos -= cSim.bornForce2_workBlock;
}
}
__global__ extern void kCalculateObcGbsaForces2_12_kernel();
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Host-side launcher for the second OBC force loop.
//
// Launches exactly one kernel, chosen by gpu->sim.nonbondedMethod (NO_CUTOFF, CUTOFF or
// PERIODIC) and by gpu->bOutputBufferPerWarp, mirroring the dispatch used by
// kCalculateObcGbsaBornSum. Every variant is given sizeof(Atom) bytes of dynamic shared
// memory per thread. NO_CUTOFF passes the full work unit list; CUTOFF and PERIODIC pass
// gpu->sim.pInteractingWorkUnit together with the count read from
// gpu->psInteractionCount->_pSysData[0].
// NOTE(review): that count is read on the host without any copy in this function; it
// presumably must already have been downloaded by the caller - confirm.
//
// Previously this function also launched the legacy kCalculateObcGbsaForces2_kernel (for
// sm < 1.2) before the switch, launched kCalculateObcGbsaForces2_12_kernel in the
// NO_CUTOFF else arm, and then launched the N2 kernel unconditionally, so the per-warp
// configuration ran two force kernels. Those extra launches and the disabled `if( 0 )`
// debug block have been removed.
void kCalculateObcGbsaForces2(gpuContext gpu)
{
    size_t numWithInteractions;
    switch (gpu->sim.nonbondedMethod)
    {
        case NO_CUTOFF:
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
            else
                kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
            break;
        case CUTOFF:
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
        case PERIODIC:
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
    }
    LAUNCHERROR("kCalculateObcGbsaForces2");
}
......@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
// Zeroes the first cSim.stride * cSim.outputBuffers float4 entries of cSim.pForce4.
// Each thread starts at its global thread index and advances by the total number of
// launched threads, so any 1-D launch configuration covers the whole range.
//
// A single loop with a single bound is used. The earlier form stacked two `while` headers
// with different limits (stride4 vs. stride), which could leave a thread spinning in the
// outer loop once the inner one had finished, and wrote a redundant scalar zero next to
// the float4 store.
__global__ void kClearForces_kernel()
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
    while (pos < cSim.stride * cSim.outputBuffers)
    {
        cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        pos += gridDim.x * blockDim.x;
    }
}
......
......@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__ void kVerletUpdatePart1_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms)
{
......@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__ void kVerletUpdatePart2_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms)
{
......@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
while (pos < cSim.atoms)
{
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters->setScaledRadiusFactors(scaleFactors);
obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for (int i = 0; i < system.getNumForces(); i++) {
const NonbondedForce* nonbonded = dynamic_cast<const NonbondedForce*>(&system.getForce(i));
if (nonbonded != NULL) {
if (nonbonded->getNonbondedMethod() != NonbondedForce::NoCutoff)
obcParameters->setUseCutoff(nonbonded->getCutoffDistance());
if (nonbonded->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
Vec3 boxVectors[3];
nonbonded->getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
RealOpenMM periodicBoxSize[3];
periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
obcParameters->setPeriodic(periodicBoxSize);
}
break;
}
}
obc = new CpuObc(obcParameters);
obc->setIncludeAceApproximation(true);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment