//----------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------- #include "cudaKernels.h" #include "amoebaCudaKernels.h" #include "kCalculateAmoebaCudaUtilities.h" #include #include #include using namespace std; #define SQRT sqrtf static __constant__ cudaGmxSimulation cSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*); void SetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaMultipoleForcesSim: cudaMemcpyToSymbol: SetSim copy to cSim failed"); status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaMultipoleForcesSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed"); } void GetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaMultipoleForcesSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed"); status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaMultipoleForcesSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); } __device__ static float normVector3( float* vector ) { float norm = DOT3( vector, vector ); float returnNorm = SQRT( norm ); norm = returnNorm > 0.0f ? 1.0f/returnNorm : 0.0f; vector[0] *= norm; vector[1] *= norm; vector[2] *= norm; return returnNorm; } #undef AMOEBA_DEBUG // ZThenX == 0 // Bisector == 1 // ZBisect == 2 // ThreeFold == 3 // ZOnly == 4 // NoAxisType == 5 __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kCudaComputeCheckChiral_kernel( void ) { const int AD = 0; const int BD = 1; const int CD = 2; const int C = 3; float delta[4][3]; float4* particleCoord = cSim.pPosq; int4* multiPoleParticles = cAmoebaSim.pMultipoleParticlesIdsAndAxisType; float* labFrameDipole = cAmoebaSim.pLabFrameDipole; float* labFrameQuadrupole = cAmoebaSim.pLabFrameQuadrupole; // --------------------------------------------------------------------------------------- int particleIndex = blockIdx.x*blockDim.x + threadIdx.x; int numberOfParticles = cSim.atoms; while( particleIndex < numberOfParticles ) { // skip z-then-x int axisType = multiPoleParticles[particleIndex].w; if( axisType != 0 && multiPoleParticles[particleIndex].x >= 0 && multiPoleParticles[particleIndex].y >=0 && multiPoleParticles[particleIndex].z >= 0 ) { // --------------------------------------------------------------------------------------- int particleA = particleIndex; int particleB = multiPoleParticles[particleIndex].z; int particleC = multiPoleParticles[particleIndex].x; int particleD = multiPoleParticles[particleIndex].y; delta[AD][0] = particleCoord[particleA].x - particleCoord[particleD].x; delta[AD][1] = particleCoord[particleA].y - particleCoord[particleD].y; delta[AD][2] = particleCoord[particleA].z - particleCoord[particleD].z; delta[BD][0] = particleCoord[particleB].x - particleCoord[particleD].x; delta[BD][1] = particleCoord[particleB].y - particleCoord[particleD].y; delta[BD][2] = particleCoord[particleB].z - particleCoord[particleD].z; delta[CD][0] = particleCoord[particleC].x - particleCoord[particleD].x; delta[CD][1] = particleCoord[particleC].y - particleCoord[particleD].y; delta[CD][2] = particleCoord[particleC].z - particleCoord[particleD].z; delta[C][0] = delta[BD][1]*delta[CD][2] - delta[BD][2]*delta[CD][1]; delta[C][1] = delta[CD][1]*delta[AD][2] - delta[CD][2]*delta[AD][1]; delta[C][2] = delta[AD][1]*delta[BD][2] - delta[AD][2]*delta[BD][1]; float volume = delta[C][0]*delta[AD][0] + delta[C][1]*delta[BD][0] + delta[C][2]*delta[CD][0]; if( volume < 0.0 ){ labFrameDipole[particleIndex*3+1] *= -1.0f; // pole(3,i) labFrameQuadrupole[particleIndex*9+1] *= -1.0f; // pole(6,i) && pole(8,i) labFrameQuadrupole[particleIndex*9+3] *= -1.0f; // pole(10,i) && pole(12,i) labFrameQuadrupole[particleIndex*9+5] *= -1.0f; // pole(6,i) && pole(8,i) labFrameQuadrupole[particleIndex*9+7] *= -1.0f; // pole(10,i) && pole(12,i) } } particleIndex += gridDim.x*blockDim.x; } } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kCudaComputeLabFrameMoments_kernel( void ) { float vectorX[3]; float vectorY[3]; float vectorZ[3]; int particleIndex = blockIdx.x*blockDim.x + threadIdx.x; float4* particleCoord = cSim.pPosq; int4* multiPoleParticles = cAmoebaSim.pMultipoleParticlesIdsAndAxisType; float* labFrameDipole = cAmoebaSim.pLabFrameDipole; float* labFrameQuadrupole = cAmoebaSim.pLabFrameQuadrupole; // get coordinates of this atom and the z & x axis atoms // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between // this atom and the axis atom // this atom is referred to as the k-atom in notes below // code common to ZThenX and Bisector while( particleIndex < cSim.atoms ) { if( multiPoleParticles[particleIndex].x >= 0 && multiPoleParticles[particleIndex].z >= 0 ) { float4 coordinatesThisParticle = particleCoord[particleIndex]; int multipoleParticleIndex = multiPoleParticles[particleIndex].z; float4 coordinatesAxisParticle = particleCoord[multipoleParticleIndex]; vectorZ[0] = coordinatesAxisParticle.x - coordinatesThisParticle.x; vectorZ[1] = coordinatesAxisParticle.y - coordinatesThisParticle.y; vectorZ[2] = coordinatesAxisParticle.z - coordinatesThisParticle.z; multipoleParticleIndex = multiPoleParticles[particleIndex].x; coordinatesAxisParticle = particleCoord[multipoleParticleIndex]; vectorX[0] = coordinatesAxisParticle.x - coordinatesThisParticle.x; vectorX[1] = coordinatesAxisParticle.y - coordinatesThisParticle.y; vectorX[2] = coordinatesAxisParticle.z - coordinatesThisParticle.z; int axisType = multiPoleParticles[particleIndex].w; /* z-only (1) norm z (2) select random x (3) x = x - (x.z)z (4) norm x z-then-x (1) norm z (2) norm x (not needed) (3) x = x - (x.z)z (4) norm x bisector (1) norm z (2) norm x (3) z = x + z (4) norm z (5) x = x - (x.z)z (6) norm x z-bisect (1) norm z (2) norm x (3) norm y (3) x = x + y (4) norm x (5) x = x - (x.z)z (6) norm x 3-fold (1) norm z (2) norm x (3) norm y (4) z = x + y + z (5) norm z (6) x = x - (x.z)z (7) norm x */ // branch based on axis type float sum = normVector3( vectorZ ); if( axisType == 1 ){ // bisector sum = normVector3( vectorX ); vectorZ[0] += vectorX[0]; vectorZ[1] += vectorX[1]; vectorZ[2] += vectorX[2]; sum = normVector3( vectorZ ); } else if( axisType == 2 || axisType == 3 ){ // z-bisect multipoleParticleIndex = multiPoleParticles[particleIndex].y; if( multipoleParticleIndex >= 0 && multipoleParticleIndex < cSim.atoms ){ coordinatesAxisParticle = particleCoord[multipoleParticleIndex]; vectorY[0] = coordinatesAxisParticle.x - coordinatesThisParticle.x; vectorY[1] = coordinatesAxisParticle.y - coordinatesThisParticle.y; vectorY[2] = coordinatesAxisParticle.z - coordinatesThisParticle.z; sum = normVector3( vectorY ); sum = normVector3( vectorX ); if( axisType == 2 ){ vectorX[0] += vectorY[0]; vectorX[1] += vectorY[1]; vectorX[2] += vectorY[2]; sum = normVector3( vectorX ); } else { // 3-fold vectorZ[0] += vectorX[0] + vectorY[0]; vectorZ[1] += vectorX[1] + vectorY[1]; vectorZ[2] += vectorX[2] + vectorY[2]; sum = normVector3( vectorZ ); } } } else if( axisType >= 4 ){ vectorX[0] = 0.1f; vectorX[1] = 0.1f; vectorX[2] = 0.1f; } // x = x - (x.z)z float dot = vectorZ[0]*vectorX[0] + vectorZ[1]*vectorX[1] + vectorZ[2]*vectorX[2]; vectorX[0] -= dot*vectorZ[0]; vectorX[1] -= dot*vectorZ[1]; vectorX[2] -= dot*vectorZ[2]; sum = normVector3( vectorX ); vectorY[0] = (vectorZ[1]*vectorX[2]) - (vectorZ[2]*vectorX[1]); vectorY[1] = (vectorZ[2]*vectorX[0]) - (vectorZ[0]*vectorX[2]); vectorY[2] = (vectorZ[0]*vectorX[1]) - (vectorZ[1]*vectorX[0]); // use identity rotation matrix for unrecognized axis types if( axisType < 0 || axisType > 4 ){ vectorX[0] = 1.0f; vectorX[1] = 0.0f; vectorX[2] = 0.0f; vectorY[0] = 0.0f; vectorY[1] = 1.0f; vectorY[2] = 0.0f; vectorZ[0] = 0.0f; vectorZ[1] = 0.0f; vectorZ[2] = 1.0f; } unsigned int offset = 3*particleIndex; float molDipole[3]; molDipole[0] = labFrameDipole[offset]; molDipole[1] = labFrameDipole[offset+1]; molDipole[2] = labFrameDipole[offset+2]; // set out-of-range elements to 0.0f labFrameDipole[offset] = molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0]; labFrameDipole[offset+1] = molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1]; labFrameDipole[offset+2] = molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2]; // --------------------------------------------------------------------------------------- float mPole[3][3]; offset = 9*particleIndex; mPole[0][0] = labFrameQuadrupole[offset]; mPole[0][1] = labFrameQuadrupole[offset+1]; mPole[0][2] = labFrameQuadrupole[offset+2]; mPole[1][0] = labFrameQuadrupole[offset+3]; mPole[1][1] = labFrameQuadrupole[offset+4]; mPole[1][2] = labFrameQuadrupole[offset+5]; mPole[2][0] = labFrameQuadrupole[offset+6]; mPole[2][1] = labFrameQuadrupole[offset+7]; mPole[2][2] = labFrameQuadrupole[offset+8]; labFrameQuadrupole[offset+8] = vectorX[2]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]); labFrameQuadrupole[offset+8] += vectorY[2]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]); labFrameQuadrupole[offset+8] += vectorZ[2]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]); labFrameQuadrupole[offset+4] = vectorX[1]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]); labFrameQuadrupole[offset+4] += vectorY[1]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]); labFrameQuadrupole[offset+4] += vectorZ[1]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]); labFrameQuadrupole[offset+5] = vectorX[1]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]); labFrameQuadrupole[offset+5] += vectorY[1]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]); labFrameQuadrupole[offset+5] += vectorZ[1]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]); labFrameQuadrupole[offset] = vectorX[0]*(vectorX[0]*mPole[0][0] + vectorY[0]*mPole[0][1] + vectorZ[0]*mPole[0][2]); labFrameQuadrupole[offset] += vectorY[0]*(vectorX[0]*mPole[1][0] + vectorY[0]*mPole[1][1] + vectorZ[0]*mPole[1][2]); labFrameQuadrupole[offset] += vectorZ[0]*(vectorX[0]*mPole[2][0] + vectorY[0]*mPole[2][1] + vectorZ[0]*mPole[2][2]); labFrameQuadrupole[offset+1] = vectorX[0]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]); labFrameQuadrupole[offset+1] += vectorY[0]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]); labFrameQuadrupole[offset+1] += vectorZ[0]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]); labFrameQuadrupole[offset+2] = vectorX[0]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]); labFrameQuadrupole[offset+2] += vectorY[0]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]); labFrameQuadrupole[offset+2] += vectorZ[0]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]); labFrameQuadrupole[offset+3] = labFrameQuadrupole[offset+1]; labFrameQuadrupole[offset+6] = labFrameQuadrupole[offset+2]; labFrameQuadrupole[offset+7] = labFrameQuadrupole[offset+5]; } particleIndex += gridDim.x*blockDim.x; } } void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) { // --------------------------------------------------------------------------------------- static const char* methodName = "computeCudaAmoebaLabFrameMoments"; // --------------------------------------------------------------------------------------- gpuContext gpu = amoebaGpu->gpuContext; int numBlocks = gpu->sim.blocks; int numThreads = gpu->sim.threads_per_block; //#define AMOEBA_DEBUG #ifdef AMOEBA_DEBUG if( amoebaGpu->log ){ (void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log ); amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download(); amoebaGpu->psMolecularDipole->Download(); amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Download(); gpu->psPosq4->Download(); for( int ii = 0; ii < gpu->natoms; ii++ ){ int mIndex = 3*ii; (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s [%6d %6d %6d %6d]\n", ii, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w, gpu->psPosq4->_pSysData[ii].x, gpu->psPosq4->_pSysData[ii].y, gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : ""), amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].x, amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].y, amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].z, amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].w ); //if( ii == 30 )ii = gpu->natoms - 30; } } #endif // copy molecular moments to lab frame moment arrays // check if chiral center requires moments to have sign flipped // compute lab frame moments cudaMemcpy( amoebaGpu->psLabFrameDipole->_pDevData, amoebaGpu->psMolecularDipole->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ); cudaMemcpy( amoebaGpu->psLabFrameQuadrupole->_pDevData, amoebaGpu->psMolecularQuadrupole->_pDevData, 9*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ); kCudaComputeCheckChiral_kernel<<< numBlocks, numThreads>>> ( ); LAUNCHERROR("kCudaComputeCheckChiral"); kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( ); LAUNCHERROR(methodName); } void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) { std::string methodName = "kCalculateAmoebaMultipoleForces"; // compute lab frame moments cudaComputeAmoebaLabFrameMoments( amoebaGpu ); if( 0 ){ gpuContext gpu = amoebaGpu->gpuContext; std::vector fileId; //fileId.push_back( 0 ); VectorOfDoubleVectors outputVector; //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psLabFrameDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 9, amoebaGpu->psLabFrameQuadrupole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaWriteVectorOfDoubleVectorsToFile( "CudaLabMoments", fileId, outputVector ); } // compute fixed E-field and mutual induced field if( hasAmoebaGeneralizedKirkwood ){ cudaComputeAmoebaFixedEAndGkFields( amoebaGpu ); cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu ); } else { if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){ cudaComputeAmoebaFixedEField( amoebaGpu ); cudaComputeAmoebaMutualInducedField( amoebaGpu ); } else { gpuContext gpu = amoebaGpu->gpuContext; kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>(); LAUNCHERROR("kFindBlockBoundsPeriodic"); kFindBlocksWithInteractionsPeriodic_kernel<<sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); LAUNCHERROR("kFindBlocksWithInteractionsPeriodic"); compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount); //compactStream( gpu->compactPlan, // gpu->sim.pInteractingWorkUnit, unsigned int* dOut // amoebaGpu->psWorkUnit->_pDevData, const unsigned int* dIn // gpu->sim.pInteractionFlag, const unsigned int* dValid // gpu->sim.workUnits, gpu // gpu->sim.pInteractionCount); kFindInteractionsWithinBlocksPeriodic_kernel<<sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic"); cudaComputeAmoebaPmeFixedEField( amoebaGpu ); cudaComputeAmoebaPmeMutualInducedField( amoebaGpu ); } } // check if induce dipole calculation converged -- abort if it did not if( amoebaGpu->mutualInducedDone == 0 ){ (void) fprintf( stderr, "%s induced dipole calculation did not converge -- aborting!\n", methodName.c_str() ); (void) fflush( stderr ); exit(-1); } // calculate electrostatic forces if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){ cudaComputeAmoebaElectrostatic( amoebaGpu, (hasAmoebaGeneralizedKirkwood ? 0 : 1) ); } else { cudaComputeAmoebaPmeElectrostatic( amoebaGpu ); } } #undef AMOEBA_DEBUG