Commit 86aacbd8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement new CUDA platform: NonbondedForce

parent eb64fa2f
......@@ -18,7 +18,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${PTHREADS_LIB})
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
......@@ -66,7 +66,8 @@ void CudaBondedUtilities::addPrefixCode(const string& source) {
void CudaBondedUtilities::initialize(const System& system) {
int numForces = forceAtoms.size();
if (numForces == 0)
hasInteractions = (numForces > 0);
if (!hasInteractions)
return;
// Build the lists of atom indices.
......@@ -164,6 +165,8 @@ void CudaBondedUtilities::computeInteractions(int groups) {
for (int i = 0; i < (int) arguments.size(); i++)
kernelArgs.push_back(&arguments[i]);
}
if (!hasInteractions)
return;
kernelArgs[3] = &groups;
context.executeKernel(kernel, &kernelArgs[0], maxBonds);
}
......@@ -131,7 +131,7 @@ private:
std::vector<std::string> prefixCode;
std::vector<void*> kernelArgs;
int numForceBuffers, maxBonds;
bool hasInitializedKernels;
bool hasInitializedKernels, hasInteractions;
};
} // namespace OpenMM
......
This diff is collapsed.
......@@ -324,6 +324,8 @@ public:
void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
periodicBoxSize = make_double4(xsize, ysize, zsize, 0.0);
invPeriodicBoxSize = make_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0.0);
periodicBoxSizeFloat = make_float4((float) xsize, (float) ysize, (float) zsize, 0.0f);
invPeriodicBoxSizeFloat = make_float4(1.0f/(float) xsize, 1.0f/(float) ysize, 1.0f/(float) zsize, 0.0f);
}
/**
* Get the inverse of the size of the periodic box.
......@@ -331,6 +333,20 @@ public:
double4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize;
}
/**
* Get a pointer to the size of the periodic box, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getPeriodicBoxSizePointer() {
return (useDoublePrecision ? reinterpret_cast<void*>(&periodicBoxSize) : reinterpret_cast<void*>(&periodicBoxSizeFloat));
}
/**
* Get a pointer to the inverse of the size of the periodic box, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getInvPeriodicBoxSizePointer() {
return (useDoublePrecision ? reinterpret_cast<void*>(&invPeriodicBoxSize) : reinterpret_cast<void*>(&invPeriodicBoxSizeFloat));
}
/**
* Get the CudaIntegrationUtilities for this context.
*/
......@@ -349,12 +365,12 @@ public:
CudaBondedUtilities& getBondedUtilities() {
return *bonded;
}
// /**
// * Get the CudaNonbondedUtilities for this context.
// */
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
/**
* Get the CudaNonbondedUtilities for this context.
*/
CudaNonbondedUtilities& getNonbondedUtilities() {
return *nonbonded;
}
/**
* Get the thread used by this context for executing parallel computations.
*/
......@@ -429,8 +445,8 @@ private:
int numThreadBlocks;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture;
double4 periodicBoxSize;
double4 invPeriodicBoxSize;
float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 periodicBoxSize, invPeriodicBoxSize;
std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines;
CUcontext context;
......@@ -458,7 +474,7 @@ private:
CudaIntegrationUtilities* integration;
CudaExpressionUtilities* expression;
CudaBondedUtilities* bonded;
// CudaNonbondedUtilities* nonbonded;
CudaNonbondedUtilities* nonbonded;
WorkThread* thread;
};
......
......@@ -92,8 +92,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name())
return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcNonbondedForceKernel::Name())
// return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcNonbondedForceKernel::Name())
return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcCustomNonbondedForceKernel::Name())
// return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcGBSAOBCForceKernel::Name())
......
This diff is collapsed.
......@@ -34,6 +34,7 @@
#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
#include <cufft.h>
namespace OpenMM {
......@@ -542,87 +543,86 @@ private:
std::vector<float> globalParamValues;
};
///**
// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
// */
//class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
//public:
// CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
// hasInitializedKernel(false), cu(cu), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
// pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
// pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
// }
// ~CudaCalcNonbondedForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the NonbondedForce this kernel will be used for
// */
// void initialize(const System& system, const NonbondedForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @param includeDirect true if direct space interactions should be included
// * @param includeReciprocal true if reciprocal space interactions should be included
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
// /**
// * Copy changed parameters over to a context.
// *
// * @param context the context to copy parameters to
// * @param force the NonbondedForce to copy the parameters from
// */
// void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
//private:
// struct SortTrait {
// typedef mm_int2 DataType;
// typedef cl_int KeyType;
// static const char* clDataType() {return "int2";}
// static const char* clKeyType() {return "int";}
// static const char* clMinKey() {return "INT_MIN";}
// static const char* clMaxKey() {return "INT_MAX";}
// static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
// static const char* clSortKey() {return "value.y";}
// };
// CudaContext& cu;
// bool hasInitializedKernel;
// CudaArray<mm_float2>* sigmaEpsilon;
// CudaArray<mm_float4>* exceptionParams;
// CudaArray<mm_float2>* cosSinSums;
// CudaArray<mm_float2>* pmeGrid;
// CudaArray<mm_float2>* pmeGrid2;
// CudaArray<cl_float>* pmeBsplineModuliX;
// CudaArray<cl_float>* pmeBsplineModuliY;
// CudaArray<cl_float>* pmeBsplineModuliZ;
// CudaArray<mm_float4>* pmeBsplineTheta;
// CudaArray<mm_float4>* pmeBsplineDTheta;
// CudaArray<cl_int>* pmeAtomRange;
// CudaArray<mm_int2>* pmeAtomGridIndex;
// CudaSort<SortTrait>* sort;
// CudaFFT3D* fft;
// CUfunction ewaldSumsKernel;
// CUfunction ewaldForcesKernel;
// CUfunction pmeGridIndexKernel;
// CUfunction pmeAtomRangeKernel;
// CUfunction pmeZIndexKernel;
// CUfunction pmeUpdateBsplinesKernel;
// CUfunction pmeSpreadChargeKernel;
// CUfunction pmeFinishSpreadChargeKernel;
// CUfunction pmeConvolutionKernel;
// CUfunction pmeInterpolateForceKernel;
// std::map<std::string, std::string> pmeDefines;
// std::vector<std::pair<int, int> > exceptionAtoms;
// double ewaldSelfEnergy, dispersionCoefficient, alpha;
// int interpolateForceThreads;
// bool hasCoulomb, hasLJ;
// static const int PmeOrder = 5;
//};
//
/**
* This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
*/
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
}
~CudaCalcNonbondedForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the NonbondedForce this kernel will be used for
*/
void initialize(const System& system, const NonbondedForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @param includeDirect true if direct space interactions should be included
* @param includeReciprocal true if reciprocal space interactions should be included
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the NonbondedForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
private:
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 8;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "int2";}
const char* getKeyType() const {return "int";}
const char* getMinKey() const {return "INT_MIN";}
const char* getMaxKey() const {return "INT_MAX";}
const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
const char* getSortKey() const {return "value.y";}
};
CudaContext& cu;
bool hasInitializedFFT;
CudaArray* sigmaEpsilon;
CudaArray* exceptionParams;
CudaArray* cosSinSums;
CudaArray* pmeGrid;
CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ;
CudaArray* pmeBsplineTheta;
CudaArray* pmeBsplineDTheta;
CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaSort* sort;
cufftHandle fft;
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeAtomRangeKernel;
CUfunction pmeZIndexKernel;
CUfunction pmeUpdateBsplinesKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeConvolutionKernel;
CUfunction pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha;
int interpolateForceThreads;
bool hasCoulomb, hasLJ;
static const int PmeOrder = 5;
};
///**
// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
// */
......
This diff is collapsed.
......@@ -38,7 +38,7 @@ namespace OpenMM {
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create Kernels that evaluate nonbonded interactions. Clients
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
* only need to provide the code for evaluating a single interaction and the list of parameters it depends on.
* A complete kernel is then synthesized using an appropriate algorithm to evaluate all interactions on all
* atoms.
......@@ -64,209 +64,199 @@ namespace OpenMM {
class OPENMM_EXPORT CudaNonbondedUtilities {
public:
class ParameterInfo;
// CudaNonbondedUtilities(CudaContext& context);
// ~CudaNonbondedUtilities();
// /**
// * Add a nonbonded interaction to be evaluated by the default interaction kernel.
// *
// * @param usesCutoff specifies whether a cutoff should be applied to this interaction
// * @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction
// * @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction.
// * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
// * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
// * @param kernel the code to evaluate the interaction
// * @param forceGroup the force group in which the interaction should be calculated
// */
// void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
// /**
// * Add a per-atom parameter that the default interaction kernel may depend on.
// */
// void addParameter(const ParameterInfo& parameter);
// /**
// * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
// */
// void addArgument(const ParameterInfo& parameter);
// /**
// * Specify the list of exclusions that an interaction outside the default kernel will depend on.
// *
// * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
// */
// void requestExclusions(const std::vector<std::vector<int> >& exclusionList);
// /**
// * Initialize this object in preparation for a simulation.
// */
// void initialize(const System& system);
// /**
// * Get the number of force buffers required for nonbonded forces.
// */
// int getNumForceBuffers() {
// return numForceBuffers;
// }
// /**
// * Get the number of energy buffers required for nonbonded forces.
// */
// int getNumEnergyBuffers() {
// return numForceThreadBlocks*forceThreadBlockSize;
// }
// /**
// * Get whether a cutoff is being used.
// */
// bool getUseCutoff() {
// return useCutoff;
// }
// /**
// * Get whether periodic boundary conditions are being used.
// */
// bool getUsePeriodic() {
// return usePeriodic;
// }
// /**
// * Get whether there is one force buffer per atom block.
// */
// bool getForceBufferPerAtomBlock() {
// return forceBufferPerAtomBlock;
// }
// /**
// * Get the number of work groups used for computing nonbonded forces.
// */
// int getNumForceThreadBlocks() {
// return numForceThreadBlocks;
// }
// /**
// * Get the size of each work group used for computing nonbonded forces.
// */
// int getForceThreadBlockSize() {
// return forceThreadBlockSize;
// }
// /**
// * Get the cutoff distance.
// */
// double getCutoffDistance() {
// return cutoff;
// }
// /**
// * Get whether any interactions have been added.
// */
// bool getHasInteractions() {
// return cutoff != -1.0;
// }
// /**
// * Get the force group in which nonbonded interactions should be computed.
// */
// int getForceGroup() {
// return nonbondedForceGroup;
// }
// /**
// * Prepare to compute interactions. This updates the neighbor list.
// */
// void prepareInteractions();
// /**
// * Compute the nonbonded interactions.
// */
// void computeInteractions();
// /**
// * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
// */
// void updateNeighborListSize();
// /**
// * Get the array containing the center of each atom block.
// */
// CudaArray<mm_float4>& getBlockCenters() {
// return *blockCenter;
// }
// /**
// * Get the array containing the dimensions of each atom block.
// */
// CudaArray<mm_float4>& getBlockBoundingBoxes() {
// return *blockBoundingBox;
// }
// /**
// * Get the array whose first element contains the number of tiles with interactions.
// */
// CudaArray<cl_uint>& getInteractionCount() {
// return *interactionCount;
// }
// /**
// * Get the array containing tiles with interactions.
// */
// CudaArray<mm_ushort2>& getInteractingTiles() {
// return *interactingTiles;
// }
// /**
// * Get the array containing flags for tiles with interactions.
// */
// CudaArray<cl_uint>& getInteractionFlags() {
// return *interactionFlags;
// }
// /**
// * Get the array containing exclusion flags.
// */
// CudaArray<cl_uint>& getExclusions() {
// return *exclusions;
// }
// /**
// * Get the array containing the index into the exclusion array for each tile.
// */
// CudaArray<cl_uint>& getExclusionIndices() {
// return *exclusionIndices;
// }
// /**
// * Get the array listing where the exclusion data starts for each row.
// */
// CudaArray<cl_uint>& getExclusionRowIndices() {
// return *exclusionRowIndices;
// }
// /**
// * Get the index of the first tile this context is responsible for processing.
// */
// int getStartTileIndex() const {
// return startTileIndex;
// }
// /**
// * Get the total number of tiles this context is responsible for processing.
// */
// int getNumTiles() const {
// return numTiles;
// }
// /**
// * Set the range of tiles that should be processed by this context.
// */
// void setTileRange(int startTileIndex, int numTiles);
// /**
// * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
// * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
// * the same neighbor list.
// *
// * @param source the source code for evaluating the force and energy
// * @param params the per-atom parameters this kernel may depend on
// * @param arguments arrays (other than per-atom parameters) that should be passed as arguments to the kernel
// * @param useExclusions specifies whether exclusions are applied to this interaction
// * @param isSymmetric specifies whether the interaction is symmetric
// */
// cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
CudaNonbondedUtilities(CudaContext& context);
~CudaNonbondedUtilities();
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
* @param usesCutoff specifies whether a cutoff should be applied to this interaction
* @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction
* @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction.
* @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
void addParameter(const ParameterInfo& parameter);
/**
* Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
*/
void addArgument(const ParameterInfo& parameter);
/**
* Specify the list of exclusions that an interaction outside the default kernel will depend on.
*
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
*/
void requestExclusions(const std::vector<std::vector<int> >& exclusionList);
/**
* Initialize this object in preparation for a simulation.
*/
void initialize(const System& system);
/**
* Get the number of energy buffers required for nonbonded forces.
*/
int getNumEnergyBuffers() {
return numForceThreadBlocks*forceThreadBlockSize;
}
/**
* Get whether a cutoff is being used.
*/
bool getUseCutoff() {
return useCutoff;
}
/**
* Get whether periodic boundary conditions are being used.
*/
bool getUsePeriodic() {
return usePeriodic;
}
/**
* Get the number of work groups used for computing nonbonded forces.
*/
int getNumForceThreadBlocks() {
return numForceThreadBlocks;
}
/**
* Get the size of each work group used for computing nonbonded forces.
*/
int getForceThreadBlockSize() {
return forceThreadBlockSize;
}
/**
* Get the cutoff distance.
*/
double getCutoffDistance() {
return cutoff;
}
/**
* Get whether any interactions have been added.
*/
bool getHasInteractions() {
return cutoff != -1.0;
}
/**
* Get the force group in which nonbonded interactions should be computed.
*/
int getForceGroup() {
return nonbondedForceGroup;
}
/**
* Prepare to compute interactions. This updates the neighbor list.
*/
void prepareInteractions();
/**
* Compute the nonbonded interactions.
*/
void computeInteractions();
/**
* Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
*/
void updateNeighborListSize();
/**
* Get the array containing the center of each atom block.
*/
CudaArray& getBlockCenters() {
return *blockCenter;
}
/**
* Get the array containing the dimensions of each atom block.
*/
CudaArray& getBlockBoundingBoxes() {
return *blockBoundingBox;
}
/**
* Get the array whose first element contains the number of tiles with interactions.
*/
CudaArray& getInteractionCount() {
return *interactionCount;
}
/**
* Get the array containing tiles with interactions.
*/
CudaArray& getInteractingTiles() {
return *interactingTiles;
}
/**
* Get the array containing flags for tiles with interactions.
*/
CudaArray& getInteractionFlags() {
return *interactionFlags;
}
/**
* Get the array containing exclusion flags.
*/
CudaArray& getExclusions() {
return *exclusions;
}
/**
* Get the array containing the index into the exclusion array for each tile.
*/
CudaArray& getExclusionIndices() {
return *exclusionIndices;
}
/**
* Get the array listing where the exclusion data starts for each row.
*/
CudaArray& getExclusionRowIndices() {
return *exclusionRowIndices;
}
/**
* Get the index of the first tile this context is responsible for processing.
*/
int getStartTileIndex() const {
return startTileIndex;
}
/**
* Get the total number of tiles this context is responsible for processing.
*/
int getNumTiles() const {
return numTiles;
}
/**
* Set the range of tiles that should be processed by this context.
*/
void setTileRange(int startTileIndex, int numTiles);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
* the same neighbor list.
*
* @param source the source code for evaluating the force and energy
* @param params the per-atom parameters this kernel may depend on
* @param arguments arrays (other than per-atom parameters) that should be passed as arguments to the kernel
* @param useExclusions specifies whether exclusions are applied to this interaction
* @param isSymmetric specifies whether the interaction is symmetric
*/
CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
private:
// static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices);
// CudaContext& context;
// cl::Kernel forceKernel;
// cl::Kernel findBlockBoundsKernel;
// cl::Kernel findInteractingBlocksKernel;
// cl::Kernel findInteractionsWithinBlocksKernel;
// CudaArray<cl_uint>* exclusions;
// CudaArray<cl_uint>* exclusionIndices;
// CudaArray<cl_uint>* exclusionRowIndices;
// CudaArray<mm_ushort2>* interactingTiles;
// CudaArray<cl_uint>* interactionFlags;
// CudaArray<cl_uint>* interactionCount;
// CudaArray<mm_float4>* blockCenter;
// CudaArray<mm_float4>* blockBoundingBox;
// std::vector<std::vector<int> > atomExclusions;
// std::vector<ParameterInfo> parameters;
// std::vector<ParameterInfo> arguments;
// std::string kernelSource;
// std::map<std::string, std::string> kernelDefines;
// double cutoff;
// bool useCutoff, usePeriodic, forceBufferPerAtomBlock, deviceIsCpu, anyExclusions;
// int numForceBuffers, startTileIndex, numTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
CudaContext& context;
CUfunction forceKernel;
CUfunction findBlockBoundsKernel;
CUfunction findInteractingBlocksKernel;
CUfunction findInteractionsWithinBlocksKernel;
CudaArray* exclusions;
CudaArray* exclusionIndices;
CudaArray* exclusionRowIndices;
CudaArray* interactingTiles;
CudaArray* interactionFlags;
CudaArray* interactionCount;
CudaArray* blockCenter;
CudaArray* blockBoundingBox;
unsigned int* pinnedInteractionCount;
std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
std::string kernelSource;
std::map<std::string, std::string> kernelDefines;
double cutoff;
bool useCutoff, usePeriodic, anyExclusions;
int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
};
/**
......@@ -309,7 +299,7 @@ public:
int getSize() const {
return size;
}
CUdeviceptr getMemory() const {
CUdeviceptr& getMemory() {
return memory;
}
private:
......
......@@ -77,7 +77,7 @@ CudaParameterSet::~CudaParameterSet() {
CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
}
void CudaParameterSet::getParameterValues(vector<vector<float> >& values) const {
void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
values.resize(numObjects);
for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters);
......
......@@ -71,7 +71,7 @@ public:
*
* @param values on exit, values[i][j] contains the value of parameter j for object i
*/
void getParameterValues(std::vector<std::vector<float> >& values) const;
void getParameterValues(std::vector<std::vector<float> >& values);
/**
* Set the values of all parameters.
*
......@@ -82,7 +82,7 @@ public:
* Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
*/
const std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() const {
std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() {
return buffers;
}
/**
......
......@@ -41,7 +41,7 @@ namespace OpenMM {
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* class FloatTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
......
#if USE_EWALD
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) {
real tempForce = 0.0f;
if (r2 < CUTOFF_SQUARED || needCorrection) {
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
real t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
t *= t;
t *= t;
t *= t;
const real erfcAlphaR = RECIP(t*t);
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += -prefactor*(1.0f-erfcAlphaR);
}
else {
#if HAS_LENNARD_JONES
real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
real sig2 = invR*sig;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
#endif
}
}
dEdR += tempForce*invR*invR;
}
#else
{
#ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (!isExcluded);
#endif
real tempForce = 0.0f;
#if HAS_LENNARD_JONES
real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
real sig2 = invR*sig;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f);
tempEnergy += includeInteraction ? epssig6*(sig6 - 1) : 0;
#endif
#if HAS_COULOMB
#ifdef USE_CUTOFF
const real prefactor = 138.935456f*posq1.w*posq2.w;
tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
tempEnergy += includeInteraction ? prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C) : 0;
#else
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
tempForce += prefactor;
tempEnergy += includeInteraction ? prefactor : 0;
#endif
#endif
dEdR += includeInteraction ? tempForce*invR*invR : 0;
}
#endif
\ No newline at end of file
/**
* Convert a real4 to a real3 by removing its last element.
*/
__device__ real3 ccb_trim(real4 v) {
inline __device__ real3 ccb_trim(real4 v) {
return make_real3(v.x, v.y, v.z);
}
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
__device__ real4 ccb_delta(real4 vec1, real4 vec2) {
inline __device__ real4 ccb_delta(real4 vec1, real4 vec2) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
......@@ -38,7 +38,7 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
__device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
inline __device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
real3 cp = cross(vec1, vec2);
return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
}
__device__ real2 multofReal2(real2 a, real2 b) {
return make_real2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
}
/**
* Precompute the cosine and sine sums which appear in each force term.
*/
extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
const unsigned int ksizex = 2*KMAX_X-1;
const unsigned int ksizey = 2*KMAX_Y-1;
const unsigned int ksizez = 2*KMAX_Z-1;
const unsigned int totalK = ksizex*ksizey*ksizez;
real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
real energy = 0;
while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
index += blockDim.x*gridDim.x;
while (index < totalK) {
// Find the wave vector (kx, ky, kz) this index corresponds to.
int rx = index/(ksizey*ksizez);
int remainder = index - rx*ksizey*ksizez;
int ry = remainder/ksizez;
int rz = remainder - ry*ksizez - KMAX_Z + 1;
ry += -KMAX_Y + 1;
real kx = rx*reciprocalBoxSize.x;
real ky = ry*reciprocalBoxSize.y;
real kz = rz*reciprocalBoxSize.z;
// Compute the sum for this wave vector.
real2 sum = make_real2(0);
for (int atom = 0; atom < NUM_ATOMS; atom++) {
real4 apos = posq[atom];
real phase = apos.x*kx;
real2 structureFactor = make_real2(cos(phase), sin(phase));
phase = apos.y*ky;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
phase = apos.z*kz;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
sum += apos.w*structureFactor;
}
cosSinSum[index] = sum;
// Compute the contribution to the energy.
real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT) / k2;
energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
index += blockDim.x*gridDim.x;
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
* Compute the reciprocal space part of the Ewald force, using the precomputed sums from the
* previous routine.
*/
extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ forceBuffers, const real4* __restrict__ posq, const real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x;
real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
while (atom < NUM_ATOMS) {
real3 force = make_real3(0);
real4 apos = posq[atom];
// Loop over all wave vectors.
int lowry = 0;
int lowrz = 1;
for (int rx = 0; rx < KMAX_X; rx++) {
real kx = rx*reciprocalBoxSize.x;
for (int ry = lowry; ry < KMAX_Y; ry++) {
real ky = ry*reciprocalBoxSize.y;
real phase = apos.x*kx;
real2 tab_xy = make_real2(cos(phase), sin(phase));
phase = apos.y*ky;
tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
for (int rz = lowrz; rz < KMAX_Z; rz++) {
real kz = rz*reciprocalBoxSize.z;
// Compute the force contribution of this wave vector.
int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz;
real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
real2 sum = cosSinSum[index];
real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
force.x += dEdR*kx;
force.y += dEdR*ky;
force.z += dEdR*kz;
lowrz = 1 - KMAX_Z;
}
lowry = 1 - KMAX_Y;
}
}
// Record the force on the atom.
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
atom += blockDim.x*gridDim.x;
}
}
#define TILE_SIZE 32
#define GROUP_SIZE 64
#define BUFFER_GROUPS 4
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real4 firstPoint = pos;
#endif
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[i];
#ifdef USE_PERIODIC
pos.x -= floor((pos.x-firstPoint.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-firstPoint.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-firstPoint.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
}
blockBoundingBox[index] = 0.5f*(maxPos-minPos);
blockCenter[index] = 0.5f*(maxPos+minPos);
index += blockDim.x*gridDim.x;
base = index*TILE_SIZE;
}
if (blockIdx.x == 0 && threadIdx.x == 0)
interactionCount[0] = 0;
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory.
*/
__device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, ushort2* temp, int* baseIndex,
unsigned int* interactionCount, ushort2* interactingTiles, real4 periodicBoxSize,
real4 invPeriodicBoxSize, const real4* posq, const real4* blockCenter, const real4* blockBoundingBox, unsigned int maxTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].x = (valid[i] ? 1 : 0);
__syncthreads();
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer;
__syncthreads();
}
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = temp[i].x;
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = temp[i].y;
__syncthreads();
int numValid = sum[BUFFER_SIZE-1];
__syncthreads();
// Compact the buffer.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
if (valid[i]) {
temp[sum[i]-1] = buffer[i];
sum[i] = valid[i];
valid[i] = false;
buffer[i] = make_ushort2(1, 1);
}
__syncthreads();
// Store it to global memory.
if (threadIdx.x == 0)
*baseIndex = atomicAdd(interactionCount, numValid);
__syncthreads();
if (*baseIndex+numValid <= maxTiles)
for (int i = threadIdx.x; i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = temp[i];
__syncthreads();
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__shared__ ushort2 buffer[BUFFER_SIZE];
__shared__ int valid[BUFFER_SIZE];
__shared__ short sum[BUFFER_SIZE];
__shared__ ushort2 temp[BUFFER_SIZE];
__shared__ int bufferFull;
__shared__ int globalIndex;
int valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+threadIdx.x] = false;
__syncthreads();
for (int baseIndex = startTileIndex+blockIdx.x*blockDim.x; baseIndex < endTileIndex; baseIndex += blockDim.x*gridDim.x) {
// Identify the pair of blocks to compare.
int index = baseIndex+threadIdx.x;
if (index < endTileIndex) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (index-y*NUM_BLOCKS+y*(y+1)/2);
}
// Find the distance between the bounding boxes of the two cells.
real4 delta = blockCenter[x]-blockCenter[y];
real4 boxSizea = blockBoundingBox[x];
real4 boxSizeb = blockBoundingBox[y];
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED) {
// Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
valid[bufferIndex] = true;
buffer[bufferIndex] = make_ushort2(x, y);
valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
}
}
__syncthreads();
if (bufferFull) {
storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
__syncthreads();
}
}
storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
extern "C" __global__ void findInteractionsWithinBlocks(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, const ushort2* __restrict__ tiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionFlags, const unsigned int* __restrict__ interactionCount, unsigned int maxTiles) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
unsigned int index = threadIdx.x & (TILE_SIZE - 1);
#if (__CUDA_ARCH__ < 200)
__shared__ unsigned int flags[128];
#endif
if (numTiles > maxTiles)
return;
unsigned int lasty = 0xFFFFFFFF;
real4 apos;
while (pos < end) {
// Extract the coordinates of this tile
ushort2 tileIndices = tiles[pos];
unsigned int x = tileIndices.x;
unsigned int y = tileIndices.y;
if (x == y) {
if (index == 0)
interactionFlags[pos] = 0xFFFFFFFF;
}
else {
// Load the bounding box for x and the atom positions for y.
real4 center = blockCenter[x];
real4 boxSize = blockBoundingBox[x];
if (y != lasty)
apos = posq[y*TILE_SIZE+index];
// Find the distance of the atom from the bounding box.
real4 delta = apos-center;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max((real) 0, fabs(delta.x)-boxSize.x);
delta.y = max((real) 0, fabs(delta.y)-boxSize.y);
delta.z = max((real) 0, fabs(delta.z)-boxSize.z);
#if (__CUDA_ARCH__ < 200)
flags[threadIdx.x] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > CUTOFF_SQUARED ? 0 : 1 << index);
if (index % 4 == 0)
flags[threadIdx.x] += flags[threadIdx.x+1]+flags[threadIdx.x+2]+flags[threadIdx.x+3];
unsigned int allFlags = 0;
if (index == 0)
allFlags = flags[threadIdx.x]+flags[threadIdx.x+4]+flags[threadIdx.x+8]+flags[threadIdx.x+12]+flags[threadIdx.x+16]+flags[threadIdx.x+20]+flags[threadIdx.x+24]+flags[threadIdx.x+28];
#else
unsigned int allFlags = __ballot(delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > CUTOFF_SQUARED);
#endif
// Sum the flags.
if (index == 0) {
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
}
lasty = y;
}
pos++;
}
}
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real padding;
#endif
} AtomData;
/**
* Compute nonbonded interactions.
*/
extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
#endif
PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
real energy = 0.0f;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
real3 force = make_real3(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
force -= delta*dEdR;
#else
force -= dEdR1;
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
int bufferIndex = 3*threadIdx.x;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_SYMMETRIC
delta *= dEdR;
force -= delta;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
#else
force -= dEdR1;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
// Sum the forces on atom2.
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta *= dEdR;
force -= delta;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force -= dEdR1;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
__threadfence_block();
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF)));
__threadfence_block();
}
pos++;
} while (pos < end);
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
real4 exceptionParams = PARAMS[index];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real sig2 = invR*exceptionParams.y;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
dEdR += exceptionParams.x*invR;
dEdR *= invR*invR;
tempEnergy += exceptionParams.x*invR;
energy += tempEnergy;
delta *= dEdR;
real3 force1 = -delta;
real3 force2 = delta;
extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real3 bsplinesCache[];
real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
const real3 scale = make_real3(RECIP(PME_ORDER-1));
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
for (int j = 0; j < PME_ORDER; j++)
pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(data[j].x, data[j].y, data[j].z, pos.w); // Storing the charge here improves cache coherency in the charge spreading kernel
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsigned long long* __restrict__ pmeGrid, const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER;
__shared__ real4 theta[PME_ORDER];
__shared__ real charge[BUFFER_SIZE];
__shared__ int basex[BUFFER_SIZE];
__shared__ int basey[BUFFER_SIZE];
__shared__ int basez[BUFFER_SIZE];
if (ix < PME_ORDER) {
for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
// Load the next block of atoms into the buffers.
if (threadIdx.x < BUFFER_SIZE) {
int atomIndex = baseIndex+threadIdx.x;
if (atomIndex < NUM_ATOMS) {
real4 pos = posq[atomIndex];
charge[threadIdx.x] = pos.w;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
}
}
__syncthreads();
int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
for (int index = 0; index < lastIndex; index++) {
int atomIndex = index+baseIndex;
if (threadIdx.x < PME_ORDER)
theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
__syncthreads();
real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
int x = basex[index]+ix;
int y = basey[index]+iy;
int z = basez[index]+iz;
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION
atomicAdd(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
#else
atomicAdd(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
#endif
}
}
}
}
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
real2* floatGrid = (real2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
#ifdef USE_DOUBLE_PRECISION
long long value = pmeGrid[2*index];
#else
long long value = pmeGrid[index];
#endif
real2 floatValue = make_real2((real) (value*scale), 0);
floatGrid[index] = floatValue;
}
}
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, real* __restrict__ energyBuffer, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
real energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
int ky = remainder/GRID_SIZE_Z;
int kz = remainder-ky*GRID_SIZE_Z;
if (kx == 0 && ky == 0 && kz == 0)
continue;
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*invPeriodicBoxSize.x;
real mhy = my*invPeriodicBoxSize.y;
real mhz = mz*invPeriodicBoxSize.z;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real2 grid = pmeGrid[index];
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}
extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real3 bsplinesCache[];
real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
real3* ddata = &bsplinesCache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
real4 force = make_real4(0);
real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Compute the force on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real gridvalue = pmeGrid[index].x;
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
}
}
}
real q = pos.w*EPSILON_FACTOR;
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF)));
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment