Commit f6346776 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement new CUDA platform: CustomGBForce

parent 5feaa943
......@@ -123,6 +123,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor;
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
......
......@@ -105,10 +105,16 @@ public:
CUdevice getDevice() {
return device;
}
/**
* Get the compute capability of the device associated with this object.
*/
double getComputeCapability() const {
return computeCapability;
}
/**
* Get the index of the CUdevice associated with this object.
*/
int getDeviceIndex() {
int getDeviceIndex() const {
return deviceIndex;
}
/**
......@@ -444,7 +450,7 @@ private:
void validateMolecules();
static bool hasInitializedCuda;
const System& system;
double time;
double time, computeCapability;
CudaPlatform::PlatformData& platformData;
int deviceIndex;
int contextIndex;
......
......@@ -98,8 +98,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGBSAOBCForceKernel::Name())
return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
// if (name == CalcCustomGBForceKernel::Name())
// return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomGBForceKernel::Name())
return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomHbondForceKernel::Name())
......
This diff is collapsed.
......@@ -715,58 +715,58 @@ private:
std::vector<void*> computeSumArgs, force1Args;
};
///**
// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
//public:
// CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
// hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
// valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomGBForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomGBForce this kernel will be used for
// */
// void initialize(const System& system, const CustomGBForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
// /**
// * Copy changed parameters over to a context.
// *
// * @param context the context to copy parameters to
// * @param force the CustomGBForce to copy the parameters from
// */
// void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
//private:
// bool hasInitializedKernels, needParameterGradient;
// int maxTiles, numComputedValues;
// CudaContext& cu;
// CudaParameterSet* params;
// CudaParameterSet* computedValues;
// CudaParameterSet* energyDerivs;
// CudaArray<cl_long>* longEnergyDerivs;
// CudaArray<cl_float>* globals;
// CudaArray<cl_float>* valueBuffers;
// CudaArray<cl_long>* longValueBuffers;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
// System& system;
// CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
//};
/**
* This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
*/
class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
public:
CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
valueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
}
~CudaCalcCustomGBForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomGBForce this kernel will be used for
*/
void initialize(const System& system, const CustomGBForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CustomGBForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
private:
bool hasInitializedKernels, needParameterGradient;
int maxTiles, numComputedValues;
CudaContext& cu;
CudaParameterSet* params;
CudaParameterSet* computedValues;
CudaParameterSet* energyDerivs;
CudaArray* longEnergyDerivs;
CudaArray* globals;
CudaArray* valueBuffers;
CudaArray* tabulatedFunctionParams;
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
std::vector<CudaArray*> tabulatedFunctions;
std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
System& system;
CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
};
/**
* This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
......
......@@ -52,7 +52,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
numForceThreadBlocks = 2*multiprocessors;
forceThreadBlockSize = 256;
forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
}
CudaNonbondedUtilities::~CudaNonbondedUtilities() {
......@@ -441,8 +441,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file;
CUmodule program = context.createModule(context.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::nonbonded, replacements), defines);
CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
CUfunction kernel = context.getKernel(program, "computeNonbonded");
// Set arguments to the Kernel.
......
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
#ifdef USE_SYMMETRIC
real tempForce = 0;
#else
real3 tempForce1 = make_real3(0);
real3 tempForce2 = make_real3(0);
#endif
COMPUTE_FORCE
#ifdef USE_SYMMETRIC
dEdR += tempForce*invR;
#else
dEdR1 += tempForce1;
dEdR2 += tempForce2;
#endif
}
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0xFFFFFFFF)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
#define TILE_SIZE 32
typedef struct {
real4 posq;
real3 force;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float padding;
#endif
} AtomData;
/**
* Compute a force based on pair interactions.
*/
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
const unsigned int* __restrict__ exclusionRowIndices,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta *= dEdR;
force -= delta;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta *= dEdR;
force -= delta;
atom2 = tbx+tj;
localData[atom2].force += delta;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
lasty = y;
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
STORE_DERIVATIVES_1
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
STORE_DERIVATIVES_2
}
pos++;
} while (pos < end);
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) {
real energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the derivatives
LOAD_DERIVATIVES
// Now calculate the per-particle energy terms.
real4 pos = posq[index];
real3 force = make_real3(0, 0, 0);
COMPUTE_ENERGY
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) {
const real scale = RECIP((real) 0xFFFFFFFF);
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 pos = posq[index];
real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
COMPUTE_FORCES
forceBuffers[index] = (long long) (force.x*0xFFFFFFFF);
forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0xFFFFFFFF);
forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0xFFFFFFFF);
}
}
#define TILE_SIZE 32
typedef struct {
real4 posq;
real value, temp;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float padding;
#endif
} AtomData;
/**
* Compute a value based on pair interactions.
*/
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
real value = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
localData[threadIdx.x].posq = posq[j];
const unsigned int localAtomIndex = threadIdx.x;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[threadIdx.x].value = 0;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real tempValue1 = 0;
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
}
localData[threadIdx.x].temp = tempValue2;
// Sum the forces on atom2.
if (tgx % 4 == 0)
localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
if (tgx == 0)
localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0xFFFFFFFF)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0xFFFFFFFF)));
}
lasty = y;
pos++;
} while (pos < end);
}
/**
* Reduce a pairwise computed value, and compute per-particle values.
*/
extern "C" __global__ void computePerParticleValues(real4* posq, long long* valueBuffers
PARAMETER_ARGUMENTS) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the pairwise value
real sum = valueBuffers[index]/(real) 0xFFFFFFFF;
// Now calculate other values
real4 pos = posq[index];
COMPUTE_VALUES
}
}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment