/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include "cudaKernels.h" #include "amoebaCudaKernels.h" #include "kCalculateAmoebaCudaUtilities.h" static __constant__ cudaGmxSimulation cSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed"); status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed"); } void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed"); status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal, float* fieldIn, float* fieldOut ) { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; // Reduce field const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; //const float term = 0.0f; while (pos < fieldComponents) { // self-term included here float totalField = EFieldReciprocal[pos] + term*cAmoebaSim.pLabFrameDipole[pos]; float* pFt = fieldIn + pos; unsigned int i = outputBuffers; while (i >= 4) { totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents]; pFt += fieldComponents*4; i -= 4; } if (i >= 2) { totalField += pFt[0] + pFt[fieldComponents]; pFt += fieldComponents*2; i -= 2; } if (i > 0) { totalField += pFt[0]; } fieldOut[pos] = totalField; pos += gridDim.x * blockDim.x; } } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut ) { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; // Reduce field const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; //const float term = 0.0; while (pos < fieldComponents) { // self-term included here float totalField = term*cAmoebaSim.pLabFrameDipole[pos]; float* pFt = fieldIn + pos; unsigned int i = outputBuffers; while (i >= 4) { totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents]; pFt += fieldComponents*4; i -= 4; } if (i >= 2) { totalField += pFt[0] + pFt[fieldComponents]; pFt += fieldComponents*2; i -= 2; } if (i > 0) { totalField += pFt[0]; } fieldOut[pos] += totalField; pos += gridDim.x * blockDim.x; } } // reduce psWorkArray_3_1 -> EField // reduce psWorkArray_3_2 -> EFieldPolar static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu ) { gpuContext gpu = amoebaGpu->gpuContext; // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self kReducePmeEFieldPolar_kernel<<sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData ); LAUNCHERROR("kReducePmeE_Fields1"); // E_Field = E_Field (reciprocal) + E_Field (direct) + self kReducePmeEField_kernel<<sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData ); LAUNCHERROR("kReducePmeE_Fields2"); } // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field #undef GK #undef INCLUDE_FIXED_FIELD_BUFFERS #define INCLUDE_FIXED_FIELD_BUFFERS #include "kCalculateAmoebaCudaFixedFieldParticle.h" #undef INCLUDE_FIXED_FIELD_BUFFERS __device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){ atomI.tempBuffer[0] += atomJ.tempBuffer[0]; atomI.tempBuffer[1] += atomJ.tempBuffer[1]; atomI.tempBuffer[2] += atomJ.tempBuffer[2]; atomI.tempBufferP[0] += atomJ.tempBufferP[0]; atomI.tempBufferP[1] += atomJ.tempBufferP[1]; atomI.tempBufferP[2] += atomJ.tempBufferP[2]; } __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ, float dscale, float pscale, float4 fields[3]){ // compute the real space portion of the Ewald summation float xr = atomJ.x - atomI.x; float yr = atomJ.y - atomI.y; float zr = atomJ.z - atomI.z; // periodic boundary conditions xr -= floorf(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; yr -= floorf(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; zr -= floorf(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; float r2 = xr*xr + yr*yr + zr*zr; if( r2 <= cSim.nonbondedCutoffSqr ){ float r = sqrtf(r2); // calculate the error function damping terms float ralpha = cSim.alphaEwald*r; float bn0 = erfcf(ralpha)/r; float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald; float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald); float exp2a = expf(-(ralpha*ralpha)); alsq2n *= alsq2; float bn1 = (bn0+alsq2n*exp2a)/r2; alsq2n *= alsq2; float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2; alsq2n *= alsq2; float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2; // compute the error function scaled and unscaled terms float scale3 = 1.0f; float scale5 = 1.0f; float scale7 = 1.0f; float damp = atomI.damp*atomJ.damp; if( damp != 0.0f ){ float ratio = (r/damp); ratio = ratio*ratio*ratio; float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole; damp = -pgamma*ratio; if( damp > -50.0f) { float expdamp = expf(damp); scale3 = 1.0f - expdamp; scale5 = 1.0f - expdamp*(1.0f-damp); scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp)); } } float dsc3 = dscale*scale3; float dsc5 = dscale*scale5; float dsc7 = dscale*scale7; float psc3 = pscale*scale3; float psc5 = pscale*scale5; float psc7 = pscale*scale7; float r3 = (r*r2); float r5 = (r3*r2); float r7 = (r5*r2); float drr3 = (1.0f-dsc3)/r3; float drr5 = 3.0f * (1.0f-dsc5)/r5; float drr7 = 15.0f * (1.0f-dsc7)/r7; float prr3 = (1.0f-psc3) / r3; float prr5 = 3.0f *(1.0f-psc5)/r5; float prr7 = 15.0f*(1.0f-psc7)/r7; float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr; float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr; float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr; float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr; float qir = qix*xr + qiy*yr + qiz*zr; float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr; float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr; float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr; float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr; float qkr = qkx*xr + qky*yr + qkz*zr; float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) - bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx; float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) - bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky; float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) - bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz; float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir) - bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix; float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir) - bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy; float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir) - bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz; float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx; float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky; float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz; float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix; float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy; float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz; float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx; float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky; float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz; float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix; float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy; float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz; // increment the field at each site due to this interaction fields[0].x = fim0 - fid0; fields[1].x = fim1 - fid1; fields[2].x = fim2 - fid2; fields[0].y = fkm0 - fkd0; fields[1].y = fkm1 - fkd1; fields[2].y = fkm2 - fkd2; fields[0].z = fim0 - fip0; fields[1].z = fim1 - fip1; fields[2].z = fim2 - fip2; fields[0].w = fkm0 - fkp0; fields[1].w = fkm1 - fkp1; fields[2].w = fkm2 - fkp2; } else { fields[0].x = 0.0f; fields[0].y = 0.0f; fields[0].z = 0.0f; fields[0].w = 0.0f; fields[1].x = 0.0f; fields[1].y = 0.0f; fields[1].z = 0.0f; fields[1].w = 0.0f; fields[2].x = 0.0f; fields[2].y = 0.0f; fields[2].z = 0.0f; fields[2].w = 0.0f; } } // Include versions of the kernels for N^2 calculations. #define METHOD_NAME(a, b) a##Cutoff##b #include "kCalculateAmoebaCudaPmeFixedEField.h" #define USE_OUTPUT_BUFFER_PER_WARP #undef METHOD_NAME #define METHOD_NAME(a, b) a##CutoffByWarp##b #include "kCalculateAmoebaCudaPmeFixedEField.h" /**--------------------------------------------------------------------------------------- Report whether a number is a nan or infinity @param number number to test @return 1 if number is nan or infinity; else return 0 --------------------------------------------------------------------------------------- */ /**--------------------------------------------------------------------------------------- Compute fixed electric field using PME @param amoebaGpu amoebaGpu context --------------------------------------------------------------------------------------- */ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) { static unsigned int threadsPerBlock = 0; gpuContext gpu = amoebaGpu->gpuContext; kClearFields_3( amoebaGpu, 2 ); // on first pass, set threads/block if( threadsPerBlock == 0 ){ unsigned int maxThreads; if (gpu->sm_version >= SM_20) maxThreads = 384; else if (gpu->sm_version >= SM_12) maxThreads = 192; else maxThreads = 64; threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads); } if (gpu->bOutputBufferPerWarp){ kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData ); } else { kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData ); } LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel"); kReducePmeDirectE_Fields( amoebaGpu ); } void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu ) { kCalculateAmoebaPMEFixedMultipoles( amoebaGpu ); cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu ); }