Commit 7b3072df authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing implementation of PME reciprocal space calculation

parent ddf4e3b4
......@@ -1374,6 +1374,7 @@ void gpuElectrostaticAllocate( amoebaGpuContext amoebaGpu )
amoebaGpu->psForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ElectrostaticForce");
amoebaGpu->psTorque = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Torque");
amoebaGpu->amoebaSim.pTorque = amoebaGpu->psTorque->_pDevData;
unsigned int offset = 3*paddedNumberOfAtoms*sizeof( float );
memset( amoebaGpu->psForce->_pSysStream[0], 0, offset );
......
......@@ -110,6 +110,7 @@ extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void cudaComputeAmoebaMapTorques( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce);
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce, CUDAStream<float4>* psOutputForce);
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
extern void SetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
......
......@@ -140,6 +140,8 @@ struct cudaAmoebaGmxSimulation {
float* pInducedDipoleS;
float* pInducedDipolePolarS;
float* pTorque;
float* pWorkArray_3_1;
float* pWorkArray_3_2;
float* pWorkArray_1_1;
......
......@@ -4,6 +4,7 @@
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "bbsort.h"
//#define AMOEBA_DEBUG
......@@ -11,7 +12,7 @@ static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
void SetCalculateAmoebaPMESim(amoebaGpuContext amoebaGpu)
{
......@@ -100,13 +101,13 @@ __device__ void computeBSplinePoint(float4* thetai, float w, float* array)
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(256, 1)
__launch_bounds__(512, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(256, 1)
#else
__launch_bounds__(128, 1)
#endif
void kComputeBsplines_kernel()
void kComputeAmoebaBsplines_kernel()
{
extern __shared__ float bsplines_cache[]; // size = block_size*pme_order*pme_order
float* array = &bsplines_cache[threadIdx.x*AMOEBA_PME_ORDER*AMOEBA_PME_ORDER];
......@@ -158,11 +159,11 @@ void kComputeBsplines_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kGridSpreadFixedMultipoles_kernel()
{
......@@ -252,11 +253,11 @@ void kGridSpreadFixedMultipoles_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kGridSpreadInducedDipoles_kernel()
{
......@@ -342,11 +343,11 @@ void kGridSpreadInducedDipoles_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kAmoebaReciprocalConvolution_kernel()
{
......@@ -380,11 +381,11 @@ void kAmoebaReciprocalConvolution_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kComputeFixedPotentialFromGrid_kernel()
{
......@@ -496,11 +497,11 @@ void kComputeFixedPotentialFromGrid_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kComputeInducedPotentialFromGrid_kernel()
{
......@@ -702,11 +703,11 @@ void kComputeInducedPotentialFromGrid_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kComputeFixedMultipoleForceAndEnergy_kernel()
{
......@@ -716,6 +717,8 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
const int deriv3[] = {4, 9, 10, 7, 15, 17, 13, 20, 18, 19};
float energy = 0.0f;
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < cSim.atoms; i += blockDim.x*gridDim.x) {
// Compute the force and energy.
multipole[0] = cSim.pPosq[i].w;
multipole[1] = cAmoebaSim.pLabFrameDipole[i+3];
multipole[2] = cAmoebaSim.pLabFrameDipole[i+3+1];
......@@ -741,17 +744,32 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
force.y += f.y;
force.z += f.z;
cSim.pForce4[i] = force;
// Compute the torque.
cAmoebaSim.pTorque[3*i] = multipole[3]*cAmoebaSim.pPhi[2] - multipole[2]*cAmoebaSim.pPhi[3]
+ 2.0f*(multipole[6]-multipole[5])*cAmoebaSim.pPhi[9]
+ multipole[8]*cAmoebaSim.pPhi[7] + multipole[9]*cAmoebaSim.pPhi[5]
- multipole[7]*cAmoebaSim.pPhi[8] - multipole[9]*cAmoebaSim.pPhi[6];
cAmoebaSim.pTorque[3*i+1] = multipole[1]*cAmoebaSim.pPhi[3] - multipole[3]*cAmoebaSim.pPhi[1]
+ 2.0f*(multipole[4]-multipole[6])*cAmoebaSim.pPhi[9]
+ multipole[7]*cAmoebaSim.pPhi[9] + multipole[8]*cAmoebaSim.pPhi[6]
- multipole[8]*cAmoebaSim.pPhi[4] - multipole[9]*cAmoebaSim.pPhi[7];
cAmoebaSim.pTorque[3*i+1] = multipole[2]*cAmoebaSim.pPhi[1] - multipole[1]*cAmoebaSim.pPhi[2]
+ 2.0f*(multipole[5]-multipole[4])*cAmoebaSim.pPhi[7]
+ multipole[7]*cAmoebaSim.pPhi[4] + multipole[9]*cAmoebaSim.pPhi[8]
- multipole[7]*cAmoebaSim.pPhi[5] - multipole[8]*cAmoebaSim.pPhi[9];
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
__launch_bounds__(768, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
__launch_bounds__(384, 1)
#else
__launch_bounds__(256, 1)
__launch_bounds__(192, 1)
#endif
void kComputeInducedDipoleForceAndEnergy_kernel()
{
......@@ -763,6 +781,8 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
const int deriv3[] = {4, 9, 10, 7, 15, 17, 13, 20, 18, 19};
float energy = 0.0f;
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < cSim.atoms; i += blockDim.x*gridDim.x) {
// Compute the force and energy.
multipole[0] = cSim.pPosq[i].w;
multipole[1] = cAmoebaSim.pLabFrameDipole[i+3];
multipole[2] = cAmoebaSim.pLabFrameDipole[i+3+1];
......@@ -802,6 +822,69 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
force.y += f.y;
force.z += f.z;
cSim.pForce4[i] = force;
// Compute the torque.
cAmoebaSim.pTorque[3*i] = multipole[3]*cAmoebaSim.pPhi[2] - multipole[2]*cAmoebaSim.pPhi[3]
+ 2.0f*(multipole[6]-multipole[5])*cAmoebaSim.pPhi[9]
+ multipole[8]*cAmoebaSim.pPhi[7] + multipole[9]*cAmoebaSim.pPhi[5]
- multipole[7]*cAmoebaSim.pPhi[8] - multipole[9]*cAmoebaSim.pPhi[6];
cAmoebaSim.pTorque[3*i+1] = multipole[1]*cAmoebaSim.pPhi[3] - multipole[3]*cAmoebaSim.pPhi[1]
+ 2.0f*(multipole[4]-multipole[6])*cAmoebaSim.pPhi[9]
+ multipole[7]*cAmoebaSim.pPhi[9] + multipole[8]*cAmoebaSim.pPhi[6]
- multipole[8]*cAmoebaSim.pPhi[4] - multipole[9]*cAmoebaSim.pPhi[7];
cAmoebaSim.pTorque[3*i+1] = multipole[2]*cAmoebaSim.pPhi[1] - multipole[1]*cAmoebaSim.pPhi[2]
+ 2.0f*(multipole[5]-multipole[4])*cAmoebaSim.pPhi[7]
+ multipole[7]*cAmoebaSim.pPhi[4] + multipole[9]*cAmoebaSim.pPhi[8]
- multipole[7]*cAmoebaSim.pPhi[5] - multipole[8]*cAmoebaSim.pPhi[9];
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
__global__ void kFindAtomRangeForGrid_kernel();
void kCalculateAmoebaPME(amoebaGpuContext amoebaGpu)
{
gpuContext gpu = amoebaGpu->gpuContext;
int threads;
if (gpu->sm_version >= SM_20)
threads = 512;
else if (gpu->sm_version >= SM_12)
threads = 256;
else
threads = 128;
kComputeAmoebaBsplines_kernel<<<gpu->sim.blocks, threads, threads*AMOEBA_PME_ORDER*AMOEBA_PME_ORDER*sizeof(float)>>>();
LAUNCHERROR("kComputeAmoebaBsplines");
bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kFindAtomRangeForGrid");
// Perform PME for the fixed multipoles.
kGridSpreadFixedMultipoles_kernel<<<8*gpu->sim.blocks, 64>>>();
LAUNCHERROR("kGridSpreadFixedMultipoles");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kAmoebaReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kAmoebaReciprocalConvolution");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
kComputeFixedPotentialFromGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeFixedPotentialFromGrid");
kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
// Perform PME for the induced dipoles.
kGridSpreadInducedDipoles_kernel<<<8*gpu->sim.blocks, 64>>>();
LAUNCHERROR("kGridSpreadInducedDipoles");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kAmoebaReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kAmoebaReciprocalConvolution");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
kComputeInducedPotentialFromGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedPotentialFromGrid");
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
}
......@@ -291,6 +291,59 @@ void amoebaMapTorqueReduce_kernel2(
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void amoebaMapTorqueReduce_kernel3(
int numThreads,
int numOfAtoms,
int maxDiff,
float* tempElecForce,
float4* outputForce
){
unsigned int tid = threadIdx.x;
__shared__ float sfx[BLOCK_SIZE];
__shared__ float sfy[BLOCK_SIZE];
__shared__ float sfz[BLOCK_SIZE];
// load values then sum and add results to elecForce
if( tid < maxDiff ){
sfx[tid] = tempElecForce[blockIdx.x*3*maxDiff + tid*3 ];
sfy[tid] = tempElecForce[blockIdx.x*3*maxDiff + tid*3+1];
sfz[tid] = tempElecForce[blockIdx.x*3*maxDiff + tid*3+2];
} else {
sfx[tid] = sfy[tid] = sfz[tid] = 0.0f;
}
__syncthreads();
for( unsigned int s = (blockDim.x)/2; s != 0; s >>= 1 ){
if(tid<s){
sfx[tid] += sfx[tid+s];
sfy[tid] += sfy[tid+s];
sfz[tid] += sfz[tid+s];
}
__syncthreads();
}
if( tid == 0 ){
outputForce[blockIdx.x].x += sfx[0];
outputForce[blockIdx.x].y += sfy[0];
outputForce[blockIdx.x].z += sfz[0];
}
}
void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce )
{
......@@ -638,3 +691,98 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
#endif
}
void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
CUDAStream<float>* psTorque,
CUDAStream<float4>* psCudaForce4 )
{
// ---------------------------------------------------------------------------------------
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaMapTorquesAndAddTotalForce";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// check that BLOCK_SIZE >= amoebaGpu->maxMapTorqueDifference
if( amoebaGpu->maxMapTorqueDifference > BLOCK_SIZE ){
(void) fprintf( amoebaGpu->log, "block size (%d) in amoebaMapTorqueReduce_kernel is too small ( > %d)! -- aborting.\n",
BLOCK_SIZE, amoebaGpu->maxMapTorqueDifference );
exit(-1);
}
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
int numThreads = min(256, (gpu->natoms));
int numBlocks = 1 + (gpu->natoms/numThreads);
amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
gpu->natoms,
gpu->psPosq4->_pDevStream[0],
psTorque->_pDevStream[0],
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pDevStream[0],
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0] );
LAUNCHERROR("AmoebaMapTrqKernel");
numBlocks = gpu->natoms;
numThreads = amoebaGpu->maxMapTorqueDifferencePow2;
amoebaMapTorqueReduce_kernel3<<< numBlocks, numThreads>>>(
numThreads, gpu->natoms,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0],
psCudaForce4->_pDevStream[0] );
LAUNCHERROR("amoebaMapTorqueReduce_kernel2");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log );
amoebaGpu->psForce->Download();
psCudaForce4->Download();
amoebaGpu->psTorque->Download();
int maxPrint = 10;
(void) fprintf( amoebaGpu->log,"Post torqueMap\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
(void) fprintf( amoebaGpu->log,"FTtl[%16.9e %16.9e %16.9e] ",
psCudaForce4->_pSysStream[0][ii].x,
psCudaForce4->_pSysStream[0][ii].y,
psCudaForce4->_pSysStream[0][ii].z );
(void) fprintf( amoebaGpu->log,"F[%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
}
(void) fflush( amoebaGpu->log );
}
if( 1 ){
//std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloat4Array( gpu->natoms, 4, gpu->psForce4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector);
cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector );
}
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment