Commit a4b2a13b authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginnings of the CUDA implementation of PME

parent e009228a
...@@ -17,7 +17,7 @@ CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include) ...@@ -17,7 +17,7 @@ CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${OPENMM_LIBRARY_NAME}_d optimized ${OPENMM_LIBRARY_NAME} cudpp cutil) TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${OPENMM_LIBRARY_NAME}_d optimized ${OPENMM_LIBRARY_NAME} cudpp cutil ${CUFFT_TARGET_LINK})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY") SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET}) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
...@@ -298,7 +298,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -298,7 +298,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]); gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
method = PERIODIC; method = PERIODIC;
} }
if (force.getNonbondedMethod() == NonbondedForce::Ewald) { if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
Vec3 boxVectors[3]; Vec3 boxVectors[3];
force.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); force.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]); gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
...@@ -308,17 +308,23 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -308,17 +308,23 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
double my = boxVectors[1][1]/force.getCutoffDistance(); double my = boxVectors[1][1]/force.getCutoffDistance();
double mz = boxVectors[2][2]/force.getCutoffDistance(); double mz = boxVectors[2][2]/force.getCutoffDistance();
double pi = 3.1415926535897932385; double pi = 3.1415926535897932385;
int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol)); if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol)); int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol)); int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
if (kmaxx%2 == 0) int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
kmaxx++; if (kmaxx%2 == 0)
if (kmaxy%2 == 0) kmaxx++;
kmaxy++; if (kmaxy%2 == 0)
if (kmaxz%2 == 0) kmaxy++;
kmaxz++; if (kmaxz%2 == 0)
gpuSetEwaldParameters(gpu, (float)alpha, kmaxx, kmaxy, kmaxz); kmaxz++;
method = EWALD; gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
method = EWALD;
}
else {
gpuSetPMEParameters(gpu, (float) alpha);
method = PARTICLE_MESH_EWALD;
}
} }
data.nonbondedMethod = method; data.nonbondedMethod = method;
gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method); gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
......
...@@ -77,6 +77,8 @@ extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu); ...@@ -77,6 +77,8 @@ extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu); extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu); extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetCalculatePMESim(gpuContext gpu);
extern void GetCalculatePMESim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu); extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu); extern void GetForcesSim(gpuContext gpu);
extern void SetShakeHSim(gpuContext gpu); extern void SetShakeHSim(gpuContext gpu);
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <string> #include <string>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <cufft.h>
#include <builtin_types.h> #include <builtin_types.h>
#include <vector_functions.h> #include <vector_functions.h>
...@@ -236,12 +237,15 @@ static const unsigned int MAX_TABULATED_FUNCTIONS = 4; ...@@ -236,12 +237,15 @@ static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
static const float PI = 3.14159265358979323846f; static const float PI = 3.14159265358979323846f;
static const int PME_ORDER = 4;
enum CudaNonbondedMethod enum CudaNonbondedMethod
{ {
NO_CUTOFF, NO_CUTOFF,
CUTOFF, CUTOFF,
PERIODIC, PERIODIC,
EWALD EWALD,
PARTICLE_MESH_EWALD
}; };
enum ExpressionOp { enum ExpressionOp {
...@@ -354,6 +358,13 @@ struct cudaGmxSimulation { ...@@ -354,6 +358,13 @@ struct cudaGmxSimulation {
float4* pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function float4* pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
float4* pTabulatedFunctionParams; // The min, max, and spacing for each tabulated function float4* pTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald) float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald)
int3 pmeGridSize; // The dimensions of the grid for particle mesh Ewald
cufftComplex* pPmeGrid; // Grid points for particle mesh Ewald
float* pPmeBsplineModuli[3];
float4* pPmeBsplineTheta;
float4* pPmeBsplineDtheta;
int4* pPmeParticleIndex; // The grid indices for each atom
float4* pPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
unsigned int bonds; // Number of bonds unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters float2* pBondParameter; // Bond parameters
......
...@@ -757,6 +757,31 @@ void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, in ...@@ -757,6 +757,31 @@ void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, in
gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0]; gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
} }
extern "C"
void gpuSetPMEParameters(gpuContext gpu, float alpha)
{
gpu->sim.alphaEwald = alpha;
int3 gridSize = make_int3(16, 16, 16);
gpu->sim.pmeGridSize = gridSize;
cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
gpu->psPmeBsplineTheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineTheta");
gpu->sim.pPmeBsplineTheta = gpu->psPmeBsplineTheta->_pDevData;
gpu->psPmeBsplineDtheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineDtheta");
gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
gpu->psPmeParticleIndex = new CUDAStream<int4>(gpu->natoms, 1, "PmeParticleIndex");
gpu->sim.pPmeParticleIndex = gpu->psPmeParticleIndex->_pDevData;
gpu->psPmeParticleFraction = new CUDAStream<float4>(gpu->natoms, 1, "PmeParticleFraction");
gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
}
extern "C" extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize) void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
{ {
...@@ -1589,6 +1614,14 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1589,6 +1614,14 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psCustomExceptionID = NULL; gpu->psCustomExceptionID = NULL;
gpu->psCustomExceptionParams = NULL; gpu->psCustomExceptionParams = NULL;
gpu->psEwaldCosSinSum = NULL; gpu->psEwaldCosSinSum = NULL;
gpu->psPmeGrid = NULL;
gpu->psPmeBsplineModuli[0] = NULL;
gpu->psPmeBsplineModuli[1] = NULL;
gpu->psPmeBsplineModuli[2] = NULL;
gpu->psPmeBsplineTheta = NULL;
gpu->psPmeBsplineDtheta = NULL;
gpu->psPmeParticleIndex = NULL;
gpu->psPmeParticleFraction = NULL;
gpu->psShakeID = NULL; gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL; gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL; gpu->psSettleID = NULL;
...@@ -1747,6 +1780,17 @@ void gpuShutDown(gpuContext gpu) ...@@ -1747,6 +1780,17 @@ void gpuShutDown(gpuContext gpu)
} }
if (gpu->psEwaldCosSinSum != NULL) if (gpu->psEwaldCosSinSum != NULL)
delete gpu->psEwaldCosSinSum; delete gpu->psEwaldCosSinSum;
if (gpu->psPmeGrid != NULL) {
delete gpu->psPmeGrid;
delete gpu->psPmeBsplineModuli[0];
delete gpu->psPmeBsplineModuli[1];
delete gpu->psPmeBsplineModuli[2];
delete gpu->psPmeBsplineTheta;
delete gpu->psPmeBsplineDtheta;
delete gpu->psPmeParticleIndex;
delete gpu->psPmeParticleFraction;
cufftDestroy(gpu->fftplan);
}
delete gpu->psObcData; delete gpu->psObcData;
delete gpu->psObcChain; delete gpu->psObcChain;
delete gpu->psBornForce; delete gpu->psBornForce;
...@@ -2102,6 +2146,7 @@ int gpuSetConstants(gpuContext gpu) ...@@ -2102,6 +2146,7 @@ int gpuSetConstants(gpuContext gpu)
SetCalculateObcGbsaBornSumSim(gpu); SetCalculateObcGbsaBornSumSim(gpu);
SetCalculateObcGbsaForces2Sim(gpu); SetCalculateObcGbsaForces2Sim(gpu);
SetCalculateAndersenThermostatSim(gpu); SetCalculateAndersenThermostatSim(gpu);
SetCalculatePMESim(gpu);
SetForcesSim(gpu); SetForcesSim(gpu);
SetShakeHSim(gpu); SetShakeHSim(gpu);
SetLangevinUpdateSim(gpu); SetLangevinUpdateSim(gpu);
......
...@@ -91,6 +91,7 @@ struct _gpuContext { ...@@ -91,6 +91,7 @@ struct _gpuContext {
unsigned long seed; unsigned long seed;
SM_VERSION sm_version; SM_VERSION sm_version;
compactionPlan compactPlan; compactionPlan compactPlan;
cufftHandle fftplan;
CUDAStream<float4>* psPosq4; CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4; CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4; CUDAStream<float4>* psOldPosq4;
...@@ -105,6 +106,12 @@ struct _gpuContext { ...@@ -105,6 +106,12 @@ struct _gpuContext {
CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions
CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
CUDAStream<float2>* psEwaldCosSinSum; CUDAStream<float2>* psEwaldCosSinSum;
CUDAStream<cufftComplex>* psPmeGrid; // Grid points for particle mesh Ewald
CUDAStream<float>* psPmeBsplineModuli[3];
CUDAStream<float4>* psPmeBsplineTheta;
CUDAStream<float4>* psPmeBsplineDtheta;
CUDAStream<int4>* psPmeParticleIndex; // The grid indices for each atom
CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
CUDAStream<float2>* psObcData; CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain; CUDAStream<float>* psObcChain;
CUDAStream<float>* psBornForce; CUDAStream<float>* psBornForce;
...@@ -206,6 +213,9 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vect ...@@ -206,6 +213,9 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vect
extern "C" extern "C"
void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz); void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);
extern "C"
void gpuSetPMEParameters(gpuContext gpu, float alpha);
extern "C" extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize); void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
......
...@@ -120,6 +120,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu) ...@@ -120,6 +120,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
// Reciprocal Space Ewald summation is in a separate kernel // Reciprocal Space Ewald summation is in a separate kernel
#include "kCalculateCDLJEwaldFastReciprocal.h" #include "kCalculateCDLJEwaldFastReciprocal.h"
void kCalculatePME(gpuContext gpu);
void kCalculateCDLJForces(gpuContext gpu) void kCalculateCDLJForces(gpuContext gpu)
{ {
...@@ -187,5 +188,22 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -187,5 +188,22 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR("kCalculateEwaldFastCosSinSums"); LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces"); LAUNCHERROR("kCalculateEwaldFastForces");
break;
case PARTICLE_MESH_EWALD:
kFindBlockBoundsEwaldDirect_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsEwaldDirect");
kFindBlocksWithInteractionsEwaldDirect_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsEwaldDirect");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
kCalculatePME(gpu);
} }
} }
...@@ -448,7 +448,7 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -448,7 +448,7 @@ __global__ void kCalculateLocalForces_kernel()
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
if (cSim.nonbondedMethod == NO_CUTOFF || cSim.nonbondedMethod == EWALD) if (cSim.nonbondedMethod == NO_CUTOFF || cSim.nonbondedMethod == EWALD || cSim.nonbondedMethod == PARTICLE_MESH_EWALD)
{ {
while (pos < cSim.LJ14_offset) while (pos < cSim.LJ14_offset)
{ {
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include <cuda.h>
using namespace std;
static __constant__ cudaGmxSimulation cSim;
void SetCalculatePMESim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculatePMESim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
inline __host__ __device__ int fast_mod(int a, int b)
{
return (b & (b - 1)) ? a % b : a & (b - 1);
}
inline __host__ __device__ float4 make_float4(float s)
{
return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 operator-(float4 &a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float b)
{
return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}
inline __host__ __device__ float4 operator+(float a, float4 b)
{
return make_float4(a + b.x, a + b.y, a + b.z, a + b.w);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w + b.w);
}
inline __host__ __device__ float4 make_float4(int3 a)
{
return make_float4(a.x, a.y, a.z, 0);
}
__global__ void kUpdateGridIndexAndFraction()
{
unsigned int tnb = blockDim.x * gridDim.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = tid; i < cSim.atoms; i += tnb)
{
float4 ftmp = cSim.pPosq[i];
__syncthreads();
float3 t = make_float3((ftmp.x/cSim.periodicBoxSizeX+1.0f)*cSim.pmeGridSize.x,
(ftmp.y/cSim.periodicBoxSizeY+1.0f)*cSim.pmeGridSize.y,
(ftmp.z/cSim.periodicBoxSizeZ+1.0f)*cSim.pmeGridSize.z);
float3 tix;
ftmp.x = modff(t.x, &tix.x);
ftmp.y = modff(t.y, &tix.y);
ftmp.z = modff(t.z, &tix.z);
cSim.pPmeParticleFraction[i] = ftmp;
/* avoid costly % operations if possible that is if dc_ngrid.* is pow. of 2 */
int4 itmp = make_int4(fast_mod(__float2int_rd(tix.x), cSim.pmeGridSize.x),
fast_mod(__float2int_rd(tix.y), cSim.pmeGridSize.y),
fast_mod(__float2int_rd(tix.z), cSim.pmeGridSize.z), 0);
cSim.pPmeParticleIndex[i] = itmp;
__syncthreads();
}
}
__global__ void kUpdateBsplines()
{
unsigned int tnb = blockDim.x * gridDim.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ float4 bsplines_cache[]; /*size = 2 * block_size * pme_order*/
const float4 div_o = make_float4(1.0f/(PME_ORDER - 1));
for (int i = tid; i < cSim.atoms; i += tnb)
{
float4* data = &bsplines_cache[threadIdx.x*PME_ORDER];
float4* ddata = &bsplines_cache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
for (int j = 0; j < PME_ORDER; j++)
{
data[j] = make_float4(0.0f);
ddata[j] = make_float4(0.0f);
}
/* load data */
float4 dr = cSim.pPmeParticleFraction[i];
__syncthreads();
data[PME_ORDER - 1] = make_float4(0.0f);
data[1] = dr;
data[0] = make_float4(1.0f) - dr;
for (int j = 3; j < PME_ORDER; j++)
{
float div = 1.0f / ((float)j - 1.0f);
data[j - 1] = div * dr * data[j - 2];
for (int k = 1; k < (j - 1); k++)
{
data[j - k - 1] =
div * (
(dr + float(k)) * data[j - k - 2] +
(-dr + ((float)(j - k))) * data[j - k - 1]);
}
data[0] = div * (- dr + 1) * data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
{
ddata[j] = data[j - 1] - data[j];
}
data[PME_ORDER - 1] = div_o * dr * data[PME_ORDER - 2];
for (int j = 1; j < (PME_ORDER - 1); j++)
{
data[PME_ORDER - j - 1] =
div_o * (
(dr + (float)j) * data[PME_ORDER - j - 2] +
(-dr + ((float)(PME_ORDER - j))) * data[PME_ORDER - j - 1]
);
}
data[0] = div_o * (-dr + 1.0f) * data[0];
__syncthreads();
/* write back */
for (int j = 0; j < PME_ORDER; j++)
{
cSim.pPmeBsplineTheta[i * PME_ORDER + j] = data[j];
cSim.pPmeBsplineDtheta[i * PME_ORDER + j] = ddata[j];
}
__syncthreads();
}
}
void kCalculatePME(gpuContext gpu)
{
// printf("kCalculatePME\n");
kUpdateGridIndexAndFraction<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kUpdateGridIndexAndFraction");
kUpdateBsplines<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, 2*gpu->sim.update_threads_per_block*PME_ORDER*sizeof(float4)>>>();
LAUNCHERROR("kUpdateBsplines");
}
...@@ -25,6 +25,6 @@ SET(CUDA_STATIC_COMPILE_FLAG "-DOPENMM_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STAT ...@@ -25,6 +25,6 @@ SET(CUDA_STATIC_COMPILE_FLAG "-DOPENMM_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STAT
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS ${CUDA_STATIC_COMPILE_FLAG}) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS ${CUDA_STATIC_COMPILE_FLAG})
TARGET_LINK_LIBRARIES(${STATIC_TARGET} optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil ) TARGET_LINK_LIBRARIES(${STATIC_TARGET} optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil )
TARGET_LINK_LIBRARIES(${STATIC_TARGET} debug ${OPENMM_LIBRARY_NAME}_static_d optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil) TARGET_LINK_LIBRARIES(${STATIC_TARGET} debug ${OPENMM_LIBRARY_NAME}_static_d optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil ${CUFFT_TARGET_LINK})
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET}) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment