Commit 0541e749 authored by Peter Eastman's avatar Peter Eastman
Browse files

Simplified and optimized calculation of energies

parent a534f1b2
...@@ -41,7 +41,6 @@ using namespace std; ...@@ -41,7 +41,6 @@ using namespace std;
static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) { static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) {
_gpuContext* gpu = data.gpu; _gpuContext* gpu = data.gpu;
gpu->bReduceEnergies = false;
if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0) if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0)
gpuReorderAtoms(gpu); gpuReorderAtoms(gpu);
data.computeForceCount++; data.computeForceCount++;
...@@ -88,37 +87,23 @@ static double calcEnergy(ContextImpl& context, CudaPlatform::PlatformData& data, ...@@ -88,37 +87,23 @@ static double calcEnergy(ContextImpl& context, CudaPlatform::PlatformData& data,
} }
else else
{ {
double totalEnergy = 0.0f;
double energy;
gpu->bReduceEnergies = true;
if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0) if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0)
gpuReorderAtoms(gpu); gpuReorderAtoms(gpu);
data.stepCount++; data.stepCount++;
kClearForces(gpu); kClearEnergy(gpu);
if (gpu->bIncludeGBSA) { if (gpu->bIncludeGBSA) {
gpu->bRecalculateBornRadii = true; gpu->bRecalculateBornRadii = true;
energy = kCalculateCDLJObcGbsaForces1(gpu); kCalculateCDLJObcGbsaForces1(gpu);
// printf("Calculated CDLJObcGbsa energy: %f\n", energy); kReduceObcGbsaBornForces(gpu);
totalEnergy += energy;
energy = kReduceObcGbsaBornForces(gpu);
// printf("Calculated reduced GbsaBorn surface area energy: %f\n", energy);
totalEnergy += energy;
kCalculateObcGbsaForces2(gpu); kCalculateObcGbsaForces2(gpu);
} }
else if (data.hasNonbonded) { else if (data.hasNonbonded) {
energy = kCalculateCDLJForces(gpu); kCalculateCDLJForces(gpu);
// printf("Calculated CDLJ energy: %f\n", energy);
totalEnergy += energy;
} }
energy = kCalculateLocalForces(gpu); kCalculateLocalForces(gpu);
// printf("Calculated local interactions energy: %f\n", energy);
totalEnergy += energy;
if (gpu->bIncludeGBSA) if (gpu->bIncludeGBSA)
kReduceBornSumAndForces(gpu); kReduceBornSumAndForces(gpu);
else return kReduceEnergy(gpu);
kReduceForces(gpu);
// printf("Total GPU energies: %f\n", totalEnergy);
return totalEnergy;
} }
return 0.0f; return 0.0f;
} }
......
...@@ -28,17 +28,19 @@ ...@@ -28,17 +28,19 @@
// Initialization // Initialization
extern void kClearForces(gpuContext gpu); extern void kClearForces(gpuContext gpu);
extern void kClearEnergy(gpuContext gpu);
extern void kClearBornForces(gpuContext gpu);
extern void kCalculateObcGbsaBornSum(gpuContext gpu); extern void kCalculateObcGbsaBornSum(gpuContext gpu);
extern void kReduceObcGbsaBornSum(gpuContext gpu); extern void kReduceObcGbsaBornSum(gpuContext gpu);
extern void kGenerateRandoms(gpuContext gpu); extern void kGenerateRandoms(gpuContext gpu);
// Main loop // Main loop
extern double kCalculateCDLJObcGbsaForces1(gpuContext gpu); extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern double kCalculateCDLJForces(gpuContext gpu); extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid); extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
extern double kReduceObcGbsaBornForces(gpuContext gpu); extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu); extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern double kCalculateLocalForces(gpuContext gpu); extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu); extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu); extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu); extern void kApplyFirstShake(gpuContext gpu);
...@@ -58,9 +60,7 @@ extern void kBrownianUpdatePart2(gpuContext gpu); ...@@ -58,9 +60,7 @@ extern void kBrownianUpdatePart2(gpuContext gpu);
// Extras // Extras
extern void kReduceForces(gpuContext gpu); extern void kReduceForces(gpuContext gpu);
extern void kReduceLocalEnergies(gpuContext gpu, float * gpu_energies); extern double kReduceEnergy(gpuContext gpu);
extern double kReduceNonbondEnergies(gpuContext gpu, float * gpu_energies);
extern void kClearBornForces(gpuContext gpu);
// Initializers // Initializers
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu); extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
......
...@@ -232,6 +232,9 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384; ...@@ -232,6 +232,9 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220; static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256; static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const unsigned int MAX_STACK_SIZE = 8; static const unsigned int MAX_STACK_SIZE = 8;
static const float PI = 3.14159265358979323846f;
enum CudaNonbondedMethod enum CudaNonbondedMethod
{ {
NO_CUTOFF, NO_CUTOFF,
...@@ -297,9 +300,10 @@ struct cudaGmxSimulation { ...@@ -297,9 +300,10 @@ struct cudaGmxSimulation {
unsigned int stride2; // Atomic attributes stride x 2 unsigned int stride2; // Atomic attributes stride x 2
unsigned int stride3; // Atomic attributes stride x 3 unsigned int stride3; // Atomic attributes stride x 3
unsigned int stride4; // Atomic attributes stride x 4 unsigned int stride4; // Atomic attributes stride x 4
unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call
unsigned int totalNonbondOutputBuffers; // Total nonbond output buffers unsigned int totalNonbondOutputBuffers; // Total nonbond output buffers
unsigned int outputBuffers; // Number of output buffers unsigned int outputBuffers; // Number of output buffers
unsigned int energyOutputBuffers; // Number of energy output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
...@@ -414,10 +418,10 @@ struct cudaGmxSimulation { ...@@ -414,10 +418,10 @@ struct cudaGmxSimulation {
float4* pForce4; // Pointer to all force4 data float4* pForce4; // Pointer to all force4 data
float4* pForce4a; // Pointer to first set of force4 data float4* pForce4a; // Pointer to first set of force4 data
float4* pForce4b; // Pointer to second set of force4 data float4* pForce4b; // Pointer to second set of force4 data
float4* pOutForce4; // Pointer to output float4 force float* pEnergy; // Pointer to energy output buffer
float* pBornForce; // Pointer to Born force data float* pBornForce; // Pointer to Born force data
float* pBornSum; // Pointer to Born Radii calculation output buffers float* pBornSum; // Pointer to Born Radii calculation output buffers
float* pBornRadii; // Pointer to Born Radii float* pBornRadii; // Pointer to Born Radii
float* pObcChain; // Pointer to OBC chain data float* pObcChain; // Pointer to OBC chain data
float4* pLinearMomentum; // Pointer to linear momentum float4* pLinearMomentum; // Pointer to linear momentum
......
...@@ -106,7 +106,6 @@ struct Molecule { ...@@ -106,7 +106,6 @@ struct Molecule {
}; };
static const float dielectricOffset = 0.009f; static const float dielectricOffset = 0.009f;
static const float PI = 3.1415926535f;
static const float probeRadius = 0.14f; static const float probeRadius = 0.14f;
static const float forceConversionFactor = 0.4184f; static const float forceConversionFactor = 0.4184f;
...@@ -1444,12 +1443,12 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1444,12 +1443,12 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->bRemoveCM = false; gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true; gpu->bRecalculateBornRadii = true;
gpu->bIncludeGBSA = false; gpu->bIncludeGBSA = false;
gpu->bReduceEnergies = true;
gpuInitializeRandoms(gpu); gpuInitializeRandoms(gpu);
// To be determined later // To be determined later
gpu->psLJ14ID = NULL; gpu->psLJ14ID = NULL;
gpu->psForce4 = NULL; gpu->psForce4 = NULL;
gpu->psEnergy = NULL;
gpu->sim.pForce4 = NULL; gpu->sim.pForce4 = NULL;
gpu->sim.pForce4a = NULL; gpu->sim.pForce4a = NULL;
gpu->sim.pForce4b = NULL; gpu->sim.pForce4b = NULL;
...@@ -1619,6 +1618,7 @@ void gpuShutDown(gpuContext gpu) ...@@ -1619,6 +1618,7 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psOldPosq4; delete gpu->psOldPosq4;
delete gpu->psVelm4; delete gpu->psVelm4;
delete gpu->psForce4; delete gpu->psForce4;
delete gpu->psEnergy;
delete gpu->psxVector4; delete gpu->psxVector4;
delete gpu->psvVector4; delete gpu->psvVector4;
delete gpu->psSigEps2; delete gpu->psSigEps2;
...@@ -1713,12 +1713,15 @@ int gpuBuildOutputBuffers(gpuContext gpu) ...@@ -1713,12 +1713,15 @@ int gpuBuildOutputBuffers(gpuContext gpu)
} }
} }
gpu->sim.outputBuffers = outputBuffers; gpu->sim.outputBuffers = outputBuffers;
gpu->sim.energyOutputBuffers = max(gpu->sim.nonbond_threads_per_block, gpu->sim.localForces_threads_per_block)*gpu->sim.blocks;
gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers, "Force"); gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers, "Force");
gpu->psEnergy = new CUDAStream<float>(gpu->sim.energyOutputBuffers, 1, "Energy");
gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce"); gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum"); gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0]; gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
gpu->sim.pForce4a = gpu->sim.pForce4; gpu->sim.pForce4a = gpu->sim.pForce4;
gpu->sim.pForce4b = gpu->sim.pForce4 + 1 * gpu->sim.nonbondOutputBuffers * gpu->sim.stride; gpu->sim.pForce4b = gpu->sim.pForce4 + 1 * gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
gpu->sim.pEnergy = gpu->psEnergy->_pDevStream[0];
gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0]; gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0]; gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];
......
...@@ -77,7 +77,6 @@ struct _gpuContext { ...@@ -77,7 +77,6 @@ struct _gpuContext {
bool bRecalculateBornRadii; bool bRecalculateBornRadii;
bool bOutputBufferPerWarp; bool bOutputBufferPerWarp;
bool bIncludeGBSA; bool bIncludeGBSA;
bool bReduceEnergies;
unsigned long seed; unsigned long seed;
SM_VERSION sm_version; SM_VERSION sm_version;
CUDPPHandle cudpp; CUDPPHandle cudpp;
...@@ -86,6 +85,7 @@ struct _gpuContext { ...@@ -86,6 +85,7 @@ struct _gpuContext {
CUDAStream<float4>* psOldPosq4; CUDAStream<float4>* psOldPosq4;
CUDAStream<float4>* psVelm4; CUDAStream<float4>* psVelm4;
CUDAStream<float4>* psForce4; CUDAStream<float4>* psForce4;
CUDAStream<float>* psEnergy; // Energy output buffer
CUDAStream<float4>* psxVector4; CUDAStream<float4>* psxVector4;
CUDAStream<float4>* psvVector4; CUDAStream<float4>* psvVector4;
CUDAStream<float2>* psSigEps2; CUDAStream<float2>* psSigEps2;
......
...@@ -122,25 +122,19 @@ void GetCalculateCDLJForcesSim(gpuContext gpu) ...@@ -122,25 +122,19 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
#include "kCalculateCDLJEwaldFastReciprocal.h" #include "kCalculateCDLJEwaldFastReciprocal.h"
__global__ extern void kCalculateCDLJCutoffForces_12_kernel(); void kCalculateCDLJForces(gpuContext gpu)
double kCalculateCDLJForces(gpuContext gpu)
{ {
// printf("kCalculateCDLJCutoffForces\n"); // printf("kCalculateCDLJCutoffForces\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
CUDPPResult result; CUDPPResult result;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJN2Forces"); LAUNCHERROR("kCalculateCDLJN2Forces");
break; break;
case CUTOFF: case CUTOFF:
...@@ -159,10 +153,10 @@ double kCalculateCDLJForces(gpuContext gpu) ...@@ -159,10 +153,10 @@ double kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJCutoffForces"); LAUNCHERROR("kCalculateCDLJCutoffForces");
break; break;
case PERIODIC: case PERIODIC:
...@@ -181,10 +175,10 @@ double kCalculateCDLJForces(gpuContext gpu) ...@@ -181,10 +175,10 @@ double kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJPeriodicForces"); LAUNCHERROR("kCalculateCDLJPeriodicForces");
break; break;
case EWALD: case EWALD:
...@@ -203,10 +197,10 @@ double kCalculateCDLJForces(gpuContext gpu) ...@@ -203,10 +197,10 @@ double kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces"); LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
if (false) if (false)
{ {
...@@ -224,19 +218,4 @@ double kCalculateCDLJForces(gpuContext gpu) ...@@ -224,19 +218,4 @@ double kCalculateCDLJForces(gpuContext gpu)
} }
} }
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
float *energies = (float*) malloc(sizeof(float) * N);
memset(energies, 0, sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
/* printf("\n");
for (int i = 0; i < N; i++) printf("\t%f", energies[i]);
printf("\n");
*/
}
cudaFree(energies_dev);
return sum;
} }
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, float * gpu_energies) __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
...@@ -38,7 +38,6 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -38,7 +38,6 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int energyIndex = blockIdx.x * blockDim.x + threadIdx.x;
float CDLJ_energy; float CDLJ_energy;
float energy = 0.0f; float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -46,8 +45,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -46,8 +45,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#endif #endif
#ifdef USE_EWALD #ifdef USE_EWALD
float PI = 3.14159265358979323846f; const float SQRT_PI = sqrt(PI);
float SQRT_PI = sqrt(PI);
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
...@@ -140,7 +138,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -140,7 +138,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
} }
#endif #endif
/* E */ /* E */
energy += CDLJ_energy / 2.0; energy += 0.5f*CDLJ_energy;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -214,7 +212,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -214,7 +212,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
CDLJ_energy = 0.0f; CDLJ_energy = 0.0f;
} }
/* E */ /* E */
energy += CDLJ_energy / 2.0; energy += 0.5f*CDLJ_energy;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -541,5 +539,5 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -541,5 +539,5 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
pos++; pos++;
} }
gpu_energies[energyIndex] = energy; cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
...@@ -104,14 +104,10 @@ extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel(); ...@@ -104,14 +104,10 @@ extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*); extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*); extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
double kCalculateCDLJObcGbsaForces1(gpuContext gpu) void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
// printf("kCalculateCDLJObcGbsaForces1\n"); // printf("kCalculateCDLJObcGbsaForces1\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
// check if Born radii need to be calculated // check if Born radii need to be calculated
kClearBornForces(gpu); kClearBornForces(gpu);
...@@ -126,10 +122,10 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -126,10 +122,10 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1"); LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break; break;
case CUTOFF: case CUTOFF:
...@@ -153,10 +149,10 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -153,10 +149,10 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break; break;
case PERIODIC: case PERIODIC:
...@@ -180,24 +176,11 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -180,24 +176,11 @@ double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break; break;
} }
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
float *energies = (float*) malloc(sizeof(float) * N);
// double sum = kReduceNonbondEnergies(gpu, energies_dev);
// printf("GPU____Sum: %f\n", sum);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
// printf("ControlSum: %f\n", sum);
}
cudaFree(energies_dev);
return sum;
} }
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, float * gpu_energies) __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
...@@ -38,7 +38,6 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -38,7 +38,6 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int energyIndex = blockIdx.x * blockDim.x + threadIdx.x;
float CDLJObcGbsa_energy; float CDLJObcGbsa_energy;
float energy = 0.0f; float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -138,7 +137,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -138,7 +137,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
/* E */ /* E */
if (i < cSim.atoms) if (i < cSim.atoms)
{ {
energy += CDLJObcGbsa_energy / 2.0f; energy += 0.5f*CDLJObcGbsa_energy;
} }
// Add Forces // Add Forces
dx *= dEdR; dx *= dEdR;
...@@ -224,7 +223,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -224,7 +223,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
/* E */ /* E */
if (i < cSim.atoms) if (i < cSim.atoms)
{ {
energy += CDLJObcGbsa_energy / 2.0f; energy += 0.5f*CDLJObcGbsa_energy;
} }
// Add Forces // Add Forces
dx *= dEdR; dx *= dEdR;
...@@ -615,5 +614,5 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -615,5 +614,5 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
} }
pos++; pos++;
} }
gpu_energies[energyIndex] = energy; cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
...@@ -59,14 +59,14 @@ static __constant__ cudaGmxSimulation cSim; ...@@ -59,14 +59,14 @@ static __constant__ cudaGmxSimulation cSim;
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \ #define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \ { \
float angle = acos(cosine); \ float angle = acos(cosine); \
float deltaIdeal = angle - (param.x * (3.14159265f / 180.0f)); \ float deltaIdeal = angle - (param.x * (PI / 180.0f)); \
dEdR = param.y * deltaIdeal; \ dEdR = param.y * deltaIdeal; \
} }
#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \ #define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
{ \ { \
float angle = acos(cosine); \ float angle = acos(cosine); \
float deltaIdeal = angle - (param.x * (3.14159265f / 180.0f)); \ float deltaIdeal = angle - (param.x * (PI / 180.0f)); \
dEdR = param.y * deltaIdeal * deltaIdeal; \ dEdR = param.y * deltaIdeal * deltaIdeal; \
} }
...@@ -116,10 +116,9 @@ void GetCalculateLocalForcesSim(gpuContext gpu) ...@@ -116,10 +116,9 @@ void GetCalculateLocalForcesSim(gpuContext gpu)
} }
__global__ void kCalculateLocalForces_kernel(float * gpu_energies) __global__ void kCalculateLocalForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int energyIndex = pos;
Vectors* A = &sV[threadIdx.x]; Vectors* A = &sV[threadIdx.x];
float energy = 0.0f; float energy = 0.0f;
...@@ -138,7 +137,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies) ...@@ -138,7 +137,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
float r2 = dx * dx + dy * dy + dz * dz; float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2); float r = sqrt(r2);
float deltaIdeal = r - bond.x; float deltaIdeal = r - bond.x;
/* E */ energy += bond.y * deltaIdeal * deltaIdeal / 2.0; /* E */ energy += 0.5f * bond.y * deltaIdeal * deltaIdeal;
float dEdR = bond.y * deltaIdeal; float dEdR = bond.y * deltaIdeal;
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f; dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR); // printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
...@@ -188,7 +187,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies) ...@@ -188,7 +187,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
float angle_energy; float angle_energy;
/* E */ GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy); /* E */ GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
energy += angle_energy / 2.0; energy += 0.5f*angle_energy;
float dEdR; float dEdR;
GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR); GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
...@@ -251,7 +250,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies) ...@@ -251,7 +250,7 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
float dihedralAngle; float dihedralAngle;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle); GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
float4 dihedral = cSim.pDihedralParameter[pos1]; float4 dihedral = cSim.pDihedralParameter[pos1];
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f); float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * PI / 180.0f);
// ATTENTION: This section leads to a divergent deltaAngle values wrt // ATTENTION: This section leads to a divergent deltaAngle values wrt
// forces and energies. We separate the case dihedral.z = n = 0, which // forces and energies. We separate the case dihedral.z = n = 0, which
...@@ -260,8 +259,8 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies) ...@@ -260,8 +259,8 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
/* E */ else /* E */ else
{ {
float deltaAngle = dihedralAngle - dihedral.y; float deltaAngle = dihedralAngle - dihedral.y;
if (deltaAngle < -M_PI) deltaAngle += 2.0 * M_PI; if (deltaAngle < -PI) deltaAngle += 2.0f * PI;
else if (deltaAngle > M_PI) deltaAngle -= 2.0 * M_PI; else if (deltaAngle > PI) deltaAngle -= 2.0f * PI;
energy += dihedral.x * deltaAngle * deltaAngle; energy += dihedral.x * deltaAngle * deltaAngle;
} }
...@@ -354,11 +353,11 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies) ...@@ -354,11 +353,11 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi); GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
if (dihedralAngle < 0.0f ) if (dihedralAngle < 0.0f )
{ {
dihedralAngle += 3.14159265f; dihedralAngle += PI;
} }
else else
{ {
dihedralAngle -= 3.14159265f; dihedralAngle -= PI;
} }
cosPhi = -cosPhi; cosPhi = -cosPhi;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi); // printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
...@@ -607,30 +606,14 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies) ...@@ -607,30 +606,14 @@ __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
} }
gpu_energies[energyIndex] = energy; cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
} }
double kCalculateLocalForces(gpuContext gpu) void kCalculateLocalForces(gpuContext gpu)
{ {
// printf("kCalculateLocalForces\n"); // printf("kCalculateLocalForces\n");
unsigned int N = gpu->sim.blocks * gpu->sim.localForces_threads_per_block; kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>(energies_dev);
LAUNCHERROR("kCalculateLocalForces"); LAUNCHERROR("kCalculateLocalForces");
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
// sum = kReduceLocalEnergies(gpu, energies_dev);
float *energies = (float*) malloc(sizeof(float) * N);
memset(energies, 0, sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
}
cudaFree(energies_dev);
return sum;
} }
...@@ -87,6 +87,23 @@ void kClearBornForces(gpuContext gpu) ...@@ -87,6 +87,23 @@ void kClearBornForces(gpuContext gpu)
LAUNCHERROR("kClearBornForces"); LAUNCHERROR("kClearBornForces");
} }
__global__ void kClearEnergy_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.energyOutputBuffers)
{
((float*)cSim.pEnergy)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearEnergy(gpuContext gpu)
{
// printf("kClearEnergy\n");
kClearEnergy_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearEnergy");
}
__global__ void kReduceBornSumAndForces_kernel() __global__ void kReduceBornSumAndForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -188,20 +205,6 @@ void kReduceBornSumAndForces(gpuContext gpu) ...@@ -188,20 +205,6 @@ void kReduceBornSumAndForces(gpuContext gpu)
//printf("kReduceBornSumAndForces\n"); //printf("kReduceBornSumAndForces\n");
kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceBornSumAndForces"); LAUNCHERROR("kReduceBornSumAndForces");
#if 0
//gpuDumpObcLoop1( gpu );
/*
gpu->psForce4->Download();
for (int i = 0; i < gpu->natoms; i++)
{
printf("%4d: %12.6f %12.6f %12.6f\n", i,
gpu->psForce4->_pSysStream[0][i].x,
gpu->psForce4->_pSysStream[0][i].y,
gpu->psForce4->_pSysStream[0][i].z
);
} */
#endif
} }
__global__ void kReduceForces_kernel() __global__ void kReduceForces_kernel()
...@@ -254,56 +257,19 @@ void kReduceForces(gpuContext gpu) ...@@ -254,56 +257,19 @@ void kReduceForces(gpuContext gpu)
LAUNCHERROR("kReduceForces"); LAUNCHERROR("kReduceForces");
} }
__global__ void kReduceEnergies_kernel(float * gpu_energies) double kReduceEnergy(gpuContext gpu)
{
// GPU energy reduction algorithm goes here
// unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
}
double kReduceLocalEnergies(gpuContext gpu, float * gpu_energies)
{ {
// printf("kReduceLocalEnergies\n"); // printf("kReduceEnergy\n");
unsigned int N = gpu->sim.blocks * gpu->sim.localForces_threads_per_block; gpu->psEnergy->Download();
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceLocalEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f; double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i]; for (int i = 0; i < gpu->sim.energyOutputBuffers; i++)
sum += (*gpu->psEnergy)[i];
return sum; return sum;
} }
double kReduceNonbondEnergies(gpuContext gpu, float * gpu_energies) __global__ void kReduceObcGbsaBornForces_kernel()
{
// printf("kReduceNonbondEnergies\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceNonbondEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
double kReduceObcGbsaBornEnergies(gpuContext gpu, float * gpu_energies)
{
// printf("kReduceObcGbsaBornEnergies\n");
unsigned int N = gpu->sim.blocks * gpu->sim.bf_reduce_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceObcGbsaBornEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
__global__ void kReduceObcGbsaBornForces_kernel(float * gpu_energies)
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
unsigned int energyIndex = pos;
float energy = 0.0f; float energy = 0.0f;
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
...@@ -357,29 +323,14 @@ __global__ void kReduceObcGbsaBornForces_kernel(float * gpu_energies) ...@@ -357,29 +323,14 @@ __global__ void kReduceObcGbsaBornForces_kernel(float * gpu_energies)
} }
/* E */ /* E */
// correct for surface area factor of -6 // correct for surface area factor of -6
gpu_energies[energyIndex] = energy / -6.0f; cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
} }
double kReduceObcGbsaBornForces(gpuContext gpu) void kReduceObcGbsaBornForces(gpuContext gpu)
{ {
unsigned int N = gpu->sim.blocks * gpu->sim.bf_reduce_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
//printf("kReduceObcGbsaBornForces\n"); //printf("kReduceObcGbsaBornForces\n");
kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(energies_dev); kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceObcGbsaBornForces"); LAUNCHERROR("kReduceObcGbsaBornForces");
float sum = 0.0f;
if (gpu->bReduceEnergies)
{
// double sum = kReduceObcGbsaBornEnergies(gpu, energies_dev);
float *energies = (float*) malloc(sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
}
cudaFree(energies_dev);
return sum;
} }
...@@ -179,7 +179,8 @@ void testForce(int numParticles, NonbondedForce::NonbondedMethod method) { ...@@ -179,7 +179,8 @@ void testForce(int numParticles, NonbondedForce::NonbondedMethod method) {
} }
norm = std::sqrt(norm); norm = std::sqrt(norm);
diff = std::sqrt(diff); diff = std::sqrt(diff);
ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm); ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
ASSERT_EQUAL_TOL(state.getPotentialEnergy(), refState.getPotentialEnergy(), 1e-3);
// Take a small step in the direction of the energy gradient. (This doesn't work with cutoffs, since the energy // Take a small step in the direction of the energy gradient. (This doesn't work with cutoffs, since the energy
// changes discontinuously at the cutoff distance.) // changes discontinuously at the cutoff distance.)
......
...@@ -257,7 +257,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -257,7 +257,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
if( totalEnergy ) if( totalEnergy )
*totalEnergy -= selfEwaldEnergy; *totalEnergy -= selfEwaldEnergy;
// printf("- selfEwaldEnergy: %f aEwald %f p1 %f\n", -selfEwaldEnergy, alphaEwald, atomParameters[atomID][QIndex]);
if( energyByAtom ){ if( energyByAtom ){
energyByAtom[atomID] -= selfEwaldEnergy; energyByAtom[atomID] -= selfEwaldEnergy;
...@@ -267,7 +266,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -267,7 +266,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// ************************************************************************************** // **************************************************************************************
// RECIPROCAL SPACE EWALD ENERGY AND FORCES // RECIPROCAL SPACE EWALD ENERGY AND FORCES
// ************************************************************************************** // **************************************************************************************
RealOpenMM tempStore = *totalEnergy;
// PME // PME
if (pme) { if (pme) {
...@@ -410,8 +408,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -410,8 +408,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
SimTKOpenMMLog::printError( message ); SimTKOpenMMLog::printError( message );
} }
// printf("recipEnergy: %f\n", *totalEnergy - tempStore);
// ************************************************************************************** // **************************************************************************************
// SHORT-RANGE ENERGY AND FORCES // SHORT-RANGE ENERGY AND FORCES
// ************************************************************************************** // **************************************************************************************
...@@ -451,13 +447,11 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -451,13 +447,11 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// accumulate energies // accumulate energies
realSpaceEwaldEnergy = atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR); realSpaceEwaldEnergy = atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR);
// printf("realSpaceEwald: %f p1 %f p2 %f invR %f erfc(aR) %f aR %f\n", realSpaceEwaldEnergy, atomParameters[ii][QIndex], atomParameters[jj][QIndex], inverseR, erfc(alphaR), alphaR);
vdwEnergy = eps*(sig6-one)*sig6; vdwEnergy = eps*(sig6-one)*sig6;
if( totalEnergy ) if( totalEnergy )
*totalEnergy += realSpaceEwaldEnergy + vdwEnergy; *totalEnergy += realSpaceEwaldEnergy + vdwEnergy;
if( energyByAtom ){ if( energyByAtom ){
energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy; energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy;
energyByAtom[jj] += realSpaceEwaldEnergy + vdwEnergy; energyByAtom[jj] += realSpaceEwaldEnergy + vdwEnergy;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment