Commit 0541e749 authored by Peter Eastman's avatar Peter Eastman
Browse files

Simplified and optimized calculation of energies

parent a534f1b2
......@@ -41,7 +41,6 @@ using namespace std;
static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) {
_gpuContext* gpu = data.gpu;
gpu->bReduceEnergies = false;
if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0)
gpuReorderAtoms(gpu);
data.computeForceCount++;
......@@ -88,37 +87,23 @@ static double calcEnergy(ContextImpl& context, CudaPlatform::PlatformData& data,
}
else
{
double totalEnergy = 0.0f;
double energy;
gpu->bReduceEnergies = true;
if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0)
gpuReorderAtoms(gpu);
data.stepCount++;
kClearForces(gpu);
kClearEnergy(gpu);
if (gpu->bIncludeGBSA) {
gpu->bRecalculateBornRadii = true;
energy = kCalculateCDLJObcGbsaForces1(gpu);
// printf("Calculated CDLJObcGbsa energy: %f\n", energy);
totalEnergy += energy;
energy = kReduceObcGbsaBornForces(gpu);
// printf("Calculated reduced GbsaBorn surface area energy: %f\n", energy);
totalEnergy += energy;
kCalculateCDLJObcGbsaForces1(gpu);
kReduceObcGbsaBornForces(gpu);
kCalculateObcGbsaForces2(gpu);
}
else if (data.hasNonbonded) {
energy = kCalculateCDLJForces(gpu);
// printf("Calculated CDLJ energy: %f\n", energy);
totalEnergy += energy;
kCalculateCDLJForces(gpu);
}
energy = kCalculateLocalForces(gpu);
// printf("Calculated local interactions energy: %f\n", energy);
totalEnergy += energy;
kCalculateLocalForces(gpu);
if (gpu->bIncludeGBSA)
kReduceBornSumAndForces(gpu);
else
kReduceForces(gpu);
// printf("Total GPU energies: %f\n", totalEnergy);
return totalEnergy;
return kReduceEnergy(gpu);
}
return 0.0f;
}
......
......@@ -28,17 +28,19 @@
// Initialization
extern void kClearForces(gpuContext gpu);
extern void kClearEnergy(gpuContext gpu);
extern void kClearBornForces(gpuContext gpu);
extern void kCalculateObcGbsaBornSum(gpuContext gpu);
extern void kReduceObcGbsaBornSum(gpuContext gpu);
extern void kGenerateRandoms(gpuContext gpu);
// Main loop
extern double kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern double kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
extern double kReduceObcGbsaBornForces(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern double kCalculateLocalForces(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu);
......@@ -58,9 +60,7 @@ extern void kBrownianUpdatePart2(gpuContext gpu);
// Extras
extern void kReduceForces(gpuContext gpu);
extern void kReduceLocalEnergies(gpuContext gpu, float * gpu_energies);
extern double kReduceNonbondEnergies(gpuContext gpu, float * gpu_energies);
extern void kClearBornForces(gpuContext gpu);
extern double kReduceEnergy(gpuContext gpu);
// Initializers
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
......
......@@ -232,6 +232,9 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const unsigned int MAX_STACK_SIZE = 8;
static const float PI = 3.14159265358979323846f;
enum CudaNonbondedMethod
{
NO_CUTOFF,
......@@ -300,6 +303,7 @@ struct cudaGmxSimulation {
unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call
unsigned int totalNonbondOutputBuffers; // Total nonbond output buffers
unsigned int outputBuffers; // Number of output buffers
unsigned int energyOutputBuffers; // Number of energy output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
......@@ -414,7 +418,7 @@ struct cudaGmxSimulation {
float4* pForce4; // Pointer to all force4 data
float4* pForce4a; // Pointer to first set of force4 data
float4* pForce4b; // Pointer to second set of force4 data
float4* pOutForce4; // Pointer to output float4 force
float* pEnergy; // Pointer to energy output buffer
float* pBornForce; // Pointer to Born force data
float* pBornSum; // Pointer to Born Radii calculation output buffers
float* pBornRadii; // Pointer to Born Radii
......
......@@ -106,7 +106,6 @@ struct Molecule {
};
static const float dielectricOffset = 0.009f;
static const float PI = 3.1415926535f;
static const float probeRadius = 0.14f;
static const float forceConversionFactor = 0.4184f;
......@@ -1444,12 +1443,12 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true;
gpu->bIncludeGBSA = false;
gpu->bReduceEnergies = true;
gpuInitializeRandoms(gpu);
// To be determined later
gpu->psLJ14ID = NULL;
gpu->psForce4 = NULL;
gpu->psEnergy = NULL;
gpu->sim.pForce4 = NULL;
gpu->sim.pForce4a = NULL;
gpu->sim.pForce4b = NULL;
......@@ -1619,6 +1618,7 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psOldPosq4;
delete gpu->psVelm4;
delete gpu->psForce4;
delete gpu->psEnergy;
delete gpu->psxVector4;
delete gpu->psvVector4;
delete gpu->psSigEps2;
......@@ -1713,12 +1713,15 @@ int gpuBuildOutputBuffers(gpuContext gpu)
}
}
gpu->sim.outputBuffers = outputBuffers;
gpu->sim.energyOutputBuffers = max(gpu->sim.nonbond_threads_per_block, gpu->sim.localForces_threads_per_block)*gpu->sim.blocks;
gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers, "Force");
gpu->psEnergy = new CUDAStream<float>(gpu->sim.energyOutputBuffers, 1, "Energy");
gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
gpu->sim.pForce4a = gpu->sim.pForce4;
gpu->sim.pForce4b = gpu->sim.pForce4 + 1 * gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
gpu->sim.pEnergy = gpu->psEnergy->_pDevStream[0];
gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];
......
......@@ -77,7 +77,6 @@ struct _gpuContext {
bool bRecalculateBornRadii;
bool bOutputBufferPerWarp;
bool bIncludeGBSA;
bool bReduceEnergies;
unsigned long seed;
SM_VERSION sm_version;
CUDPPHandle cudpp;
......@@ -86,6 +85,7 @@ struct _gpuContext {
CUDAStream<float4>* psOldPosq4;
CUDAStream<float4>* psVelm4;
CUDAStream<float4>* psForce4;
CUDAStream<float>* psEnergy; // Energy output buffer
CUDAStream<float4>* psxVector4;
CUDAStream<float4>* psvVector4;
CUDAStream<float2>* psSigEps2;
......
......@@ -122,25 +122,19 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
#include "kCalculateCDLJEwaldFastReciprocal.h"
__global__ extern void kCalculateCDLJCutoffForces_12_kernel();
double kCalculateCDLJForces(gpuContext gpu)
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJCutoffForces\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
CUDPPResult result;
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJN2Forces");
break;
case CUTOFF:
......@@ -159,10 +153,10 @@ double kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
......@@ -181,10 +175,10 @@ double kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
break;
case EWALD:
......@@ -203,10 +197,10 @@ double kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
if (false)
{
......@@ -224,19 +218,4 @@ double kCalculateCDLJForces(gpuContext gpu)
}
}
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
float *energies = (float*) malloc(sizeof(float) * N);
memset(energies, 0, sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
/* printf("\n");
for (int i = 0; i < N; i++) printf("\t%f", energies[i]);
printf("\n");
*/
}
cudaFree(energies_dev);
return sum;
}
......@@ -30,7 +30,7 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, float * gpu_energies)
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
......@@ -38,7 +38,6 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int energyIndex = blockIdx.x * blockDim.x + threadIdx.x;
float CDLJ_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
......@@ -46,8 +45,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#endif
#ifdef USE_EWALD
float PI = 3.14159265358979323846f;
float SQRT_PI = sqrt(PI);
const float SQRT_PI = sqrt(PI);
#endif
unsigned int lasty = 0xFFFFFFFF;
......@@ -140,7 +138,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
}
#endif
/* E */
energy += CDLJ_energy / 2.0;
energy += 0.5f*CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
......@@ -214,7 +212,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
CDLJ_energy = 0.0f;
}
/* E */
energy += CDLJ_energy / 2.0;
energy += 0.5f*CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
......@@ -541,5 +539,5 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
pos++;
}
gpu_energies[energyIndex] = energy;
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
......@@ -104,14 +104,10 @@ extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
// printf("kCalculateCDLJObcGbsaForces1\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
// check if Born radii need to be calculated
kClearBornForces(gpu);
......@@ -126,10 +122,10 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
}
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break;
case CUTOFF:
......@@ -153,10 +149,10 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
}
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break;
case PERIODIC:
......@@ -180,24 +176,11 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
}
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break;
}
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
float *energies = (float*) malloc(sizeof(float) * N);
// double sum = kReduceNonbondEnergies(gpu, energies_dev);
// printf("GPU____Sum: %f\n", sum);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
// printf("ControlSum: %f\n", sum);
}
cudaFree(energies_dev);
return sum;
}
......@@ -30,7 +30,7 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, float * gpu_energies)
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
......@@ -38,7 +38,6 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int energyIndex = blockIdx.x * blockDim.x + threadIdx.x;
float CDLJObcGbsa_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
......@@ -138,7 +137,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy / 2.0f;
energy += 0.5f*CDLJObcGbsa_energy;
}
// Add Forces
dx *= dEdR;
......@@ -224,7 +223,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy / 2.0f;
energy += 0.5f*CDLJObcGbsa_energy;
}
// Add Forces
dx *= dEdR;
......@@ -615,5 +614,5 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
}
pos++;
}
gpu_energies[energyIndex] = energy;
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
......@@ -59,14 +59,14 @@ static __constant__ cudaGmxSimulation cSim;
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
float angle = acos(cosine); \
float deltaIdeal = angle - (param.x * (3.14159265f / 180.0f)); \
float deltaIdeal = angle - (param.x * (PI / 180.0f)); \
dEdR = param.y * deltaIdeal; \
}
#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
float angle = acos(cosine); \
float deltaIdeal = angle - (param.x * (3.14159265f / 180.0f)); \
float deltaIdeal = angle - (param.x * (PI / 180.0f)); \
dEdR = param.y * deltaIdeal * deltaIdeal; \
}
......@@ -116,10 +116,9 @@ void GetCalculateLocalForcesSim(gpuContext gpu)
}
__global__ void kCalculateLocalForces_kernel(float * gpu_energies)
__global__ void kCalculateLocalForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int energyIndex = pos;
Vectors* A = &sV[threadIdx.x];
float energy = 0.0f;
......@@ -138,7 +137,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
float deltaIdeal = r - bond.x;
/* E */ energy += bond.y * deltaIdeal * deltaIdeal / 2.0;
/* E */ energy += 0.5f * bond.y * deltaIdeal * deltaIdeal;
float dEdR = bond.y * deltaIdeal;
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
......@@ -188,7 +187,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
float angle_energy;
/* E */ GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
energy += angle_energy / 2.0;
energy += 0.5f*angle_energy;
float dEdR;
GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
......@@ -251,7 +250,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
float dihedralAngle;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
float4 dihedral = cSim.pDihedralParameter[pos1];
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * PI / 180.0f);
// ATTENTION: This section leads to a divergent deltaAngle values wrt
// forces and energies. We separate the case dihedral.z = n = 0, which
......@@ -260,8 +259,8 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
/* E */ else
{
float deltaAngle = dihedralAngle - dihedral.y;
if (deltaAngle < -M_PI) deltaAngle += 2.0 * M_PI;
else if (deltaAngle > M_PI) deltaAngle -= 2.0 * M_PI;
if (deltaAngle < -PI) deltaAngle += 2.0f * PI;
else if (deltaAngle > PI) deltaAngle -= 2.0f * PI;
energy += dihedral.x * deltaAngle * deltaAngle;
}
......@@ -354,11 +353,11 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
if (dihedralAngle < 0.0f )
{
dihedralAngle += 3.14159265f;
dihedralAngle += PI;
}
else
{
dihedralAngle -= 3.14159265f;
dihedralAngle -= PI;
}
cosPhi = -cosPhi;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
......@@ -607,30 +606,14 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
pos += blockDim.x * gridDim.x;
}
}
gpu_energies[energyIndex] = energy;
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
}
double kCalculateLocalForces(gpuContext gpu)
void kCalculateLocalForces(gpuContext gpu)
{
// printf("kCalculateLocalForces\n");
unsigned int N = gpu->sim.blocks * gpu->sim.localForces_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>(energies_dev);
kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
LAUNCHERROR("kCalculateLocalForces");
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
// sum = kReduceLocalEnergies(gpu, energies_dev);
float *energies = (float*) malloc(sizeof(float) * N);
memset(energies, 0, sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
}
cudaFree(energies_dev);
return sum;
}
......@@ -87,6 +87,23 @@ void kClearBornForces(gpuContext gpu)
LAUNCHERROR("kClearBornForces");
}
__global__ void kClearEnergy_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.energyOutputBuffers)
{
((float*)cSim.pEnergy)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearEnergy(gpuContext gpu)
{
// printf("kClearEnergy\n");
kClearEnergy_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearEnergy");
}
__global__ void kReduceBornSumAndForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......@@ -188,20 +205,6 @@ void kReduceBornSumAndForces(gpuContext gpu)
//printf("kReduceBornSumAndForces\n");
kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceBornSumAndForces");
#if 0
//gpuDumpObcLoop1( gpu );
/*
gpu->psForce4->Download();
for (int i = 0; i < gpu->natoms; i++)
{
printf("%4d: %12.6f %12.6f %12.6f\n", i,
gpu->psForce4->_pSysStream[0][i].x,
gpu->psForce4->_pSysStream[0][i].y,
gpu->psForce4->_pSysStream[0][i].z
);
} */
#endif
}
__global__ void kReduceForces_kernel()
......@@ -254,56 +257,19 @@ void kReduceForces(gpuContext gpu)
LAUNCHERROR("kReduceForces");
}
__global__ void kReduceEnergies_kernel(float * gpu_energies)
{
// GPU energy reduction algorithm goes here
// unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
}
double kReduceLocalEnergies(gpuContext gpu, float * gpu_energies)
double kReduceEnergy(gpuContext gpu)
{
// printf("kReduceLocalEnergies\n");
unsigned int N = gpu->sim.blocks * gpu->sim.localForces_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceLocalEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
// printf("kReduceEnergy\n");
gpu->psEnergy->Download();
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
for (int i = 0; i < gpu->sim.energyOutputBuffers; i++)
sum += (*gpu->psEnergy)[i];
return sum;
}
double kReduceNonbondEnergies(gpuContext gpu, float * gpu_energies)
{
// printf("kReduceNonbondEnergies\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceNonbondEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
double kReduceObcGbsaBornEnergies(gpuContext gpu, float * gpu_energies)
{
// printf("kReduceObcGbsaBornEnergies\n");
unsigned int N = gpu->sim.blocks * gpu->sim.bf_reduce_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceObcGbsaBornEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
__global__ void kReduceObcGbsaBornForces_kernel(float * gpu_energies)
__global__ void kReduceObcGbsaBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
unsigned int energyIndex = pos;
float energy = 0.0f;
while (pos < cSim.atoms)
{
......@@ -357,29 +323,14 @@ __global__ void kReduceObcGbsaBornForces_kernel(float * gpu_energies)
}
/* E */
// correct for surface area factor of -6
gpu_energies[energyIndex] = energy / -6.0f;
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
}
double kReduceObcGbsaBornForces(gpuContext gpu)
void kReduceObcGbsaBornForces(gpuContext gpu)
{
unsigned int N = gpu->sim.blocks * gpu->sim.bf_reduce_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
//printf("kReduceObcGbsaBornForces\n");
kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(energies_dev);
kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceObcGbsaBornForces");
float sum = 0.0f;
if (gpu->bReduceEnergies)
{
// double sum = kReduceObcGbsaBornEnergies(gpu, energies_dev);
float *energies = (float*) malloc(sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
}
cudaFree(energies_dev);
return sum;
}
......@@ -180,6 +180,7 @@ void testForce(int numParticles, NonbondedForce::NonbondedMethod method) {
norm = std::sqrt(norm);
diff = std::sqrt(diff);
ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
ASSERT_EQUAL_TOL(state.getPotentialEnergy(), refState.getPotentialEnergy(), 1e-3);
// Take a small step in the direction of the energy gradient. (This doesn't work with cutoffs, since the energy
// changes discontinuously at the cutoff distance.)
......
......@@ -257,7 +257,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
if( totalEnergy )
*totalEnergy -= selfEwaldEnergy;
// printf("- selfEwaldEnergy: %f aEwald %f p1 %f\n", -selfEwaldEnergy, alphaEwald, atomParameters[atomID][QIndex]);
if( energyByAtom ){
energyByAtom[atomID] -= selfEwaldEnergy;
......@@ -267,7 +266,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// **************************************************************************************
// RECIPROCAL SPACE EWALD ENERGY AND FORCES
// **************************************************************************************
RealOpenMM tempStore = *totalEnergy;
// PME
if (pme) {
......@@ -410,8 +408,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
SimTKOpenMMLog::printError( message );
}
// printf("recipEnergy: %f\n", *totalEnergy - tempStore);
// **************************************************************************************
// SHORT-RANGE ENERGY AND FORCES
// **************************************************************************************
......@@ -451,13 +447,11 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// accumulate energies
realSpaceEwaldEnergy = atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR);
// printf("realSpaceEwald: %f p1 %f p2 %f invR %f erfc(aR) %f aR %f\n", realSpaceEwaldEnergy, atomParameters[ii][QIndex], atomParameters[jj][QIndex], inverseR, erfc(alphaR), alphaR);
vdwEnergy = eps*(sig6-one)*sig6;
if( totalEnergy )
*totalEnergy += realSpaceEwaldEnergy + vdwEnergy;
if( energyByAtom ){
energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy;
energyByAtom[jj] += realSpaceEwaldEnergy + vdwEnergy;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment