//----------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------- #include "amoebaCudaKernels.h" #include #include #include using namespace std; #define SQRT sqrtf static __constant__ cudaGmxSimulation cSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; void SetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaMultipoleForcesSim: cudaMemcpyToSymbol: SetSim copy to cSim failed"); status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaMultipoleForcesSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed"); } void GetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaMultipoleForcesSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed"); status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaMultipoleForcesSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); } __device__ static float normVector3( float* vector ) { float norm = DOT3( vector, vector ); float returnNorm = SQRT( norm ); norm = returnNorm > 0.0f ? 1.0f/returnNorm : 0.0f; vector[0] *= norm; vector[1] *= norm; vector[2] *= norm; return returnNorm; } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 130) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kCudaComputeLabFrameMoments_kernel( int numOfAtoms, float *rotationMatrix, float4 *atomCoord, int4 *multiPoleAtoms, float *molecularDipole, float *molecularQuadrupole, float *labFrameDipole, float *labFrameQuadrupole ) { float* vectorX; float* vectorY; float* vectorZ; // --------------------------------------------------------------------------------------- int atomIndex = blockIdx.x;//__mul24(blockIdx.x,blockDim.x) + threadIdx.x ; // --------------------------------------------------------------------------------------- // get coordinates of this atom and the z & x axis atoms // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between // this atom and the axis atom // this atom is referred to as the k-atom in notes below // code common to ZThenX and Bisector vectorX = &(rotationMatrix[atomIndex*9]); vectorY = &(rotationMatrix[atomIndex*9+ 3]); vectorZ = &(rotationMatrix[atomIndex*9+ 6]); float4 coordinatesThisAtom = atomCoord[atomIndex]; int multipoleAtomIndex = multiPoleAtoms[atomIndex].x; float4 coordinatesAxisAtom = atomCoord[multipoleAtomIndex]; vectorZ[0] = coordinatesAxisAtom.x - coordinatesThisAtom.x; vectorZ[1] = coordinatesAxisAtom.y - coordinatesThisAtom.y; vectorZ[2] = coordinatesAxisAtom.z - coordinatesThisAtom.z; multipoleAtomIndex = multiPoleAtoms[atomIndex].y; coordinatesAxisAtom = atomCoord[multipoleAtomIndex]; vectorX[0] = coordinatesAxisAtom.x - coordinatesThisAtom.x; vectorX[1] = coordinatesAxisAtom.y - coordinatesThisAtom.y; vectorX[2] = coordinatesAxisAtom.z - coordinatesThisAtom.z; int axisType = multiPoleAtoms[atomIndex].w; float sum = normVector3( vectorZ ); // branch based on axis type if( axisType == 1 ){ // bisector // dx = dx1 + dx2 (in Tinker code) sum = normVector3( vectorX ); vectorZ[0] += vectorX[0]; vectorZ[1] += vectorX[1]; vectorZ[2] += vectorX[2]; sum = normVector3( vectorZ ); } float dot = vectorZ[0]*vectorX[0] + vectorZ[1]*vectorX[1] + vectorZ[2]*vectorX[2]; vectorX[0] -= dot*vectorZ[0]; vectorX[1] -= dot*vectorZ[1]; vectorX[2] -= dot*vectorZ[2]; sum = normVector3( vectorX ); vectorY[0] = (vectorZ[1]*vectorX[2]) - (vectorZ[2]*vectorX[1]); vectorY[1] = (vectorZ[2]*vectorX[0]) - (vectorZ[0]*vectorX[2]); vectorY[2] = (vectorZ[0]*vectorX[1]) - (vectorZ[1]*vectorX[0]); float* molDipole = &(molecularDipole[atomIndex*3]); float* labDipole = &(labFrameDipole[atomIndex*3]); // set out-of-range elements to 0.0f labDipole[0] = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0]; labDipole[1] = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1]; labDipole[2] = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2]; // --------------------------------------------------------------------------------------- const float * mPole[3]; float* rPole[3]; float* molQuadrupole = &(molecularQuadrupole[atomIndex*9]); float* labQuadrupole = &(labFrameQuadrupole[atomIndex*9]); for( int ii = 0; ii < 3; ii++ ){ mPole[ii] = molQuadrupole + ii*3; rPole[ii] = labQuadrupole + ii*3; rPole[ii][0] = rPole[ii][1] = rPole[ii][2] = 0.0f; } int ii = threadIdx.x; if( ii < 3 ){ for( int jj = ii; jj < 3; jj++ ){ rPole[ii][jj] += vectorX[ii]*vectorX[jj]*mPole[0][0]; rPole[ii][jj] += vectorX[ii]*vectorY[jj]*mPole[0][1]; rPole[ii][jj] += vectorX[ii]*vectorZ[jj]*mPole[0][2]; rPole[ii][jj] += vectorY[ii]*vectorX[jj]*mPole[1][0]; rPole[ii][jj] += vectorY[ii]*vectorY[jj]*mPole[1][1]; rPole[ii][jj] += vectorY[ii]*vectorZ[jj]*mPole[1][2]; rPole[ii][jj] += vectorZ[ii]*vectorX[jj]*mPole[2][0]; rPole[ii][jj] += vectorZ[ii]*vectorY[jj]*mPole[2][1]; rPole[ii][jj] += vectorZ[ii]*vectorZ[jj]*mPole[2][2]; } } __syncthreads(); rPole[1][0] = rPole[0][1]; rPole[2][0] = rPole[0][2]; rPole[2][1] = rPole[1][2]; // set out-of-range elements to 0.0f labQuadrupole[0] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[0]; labQuadrupole[1] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[1]; labQuadrupole[2] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[2]; labQuadrupole[3] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[3]; labQuadrupole[4] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[4]; labQuadrupole[5] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[5]; labQuadrupole[6] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[6]; labQuadrupole[7] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[7]; labQuadrupole[8] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[8]; } void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) { // --------------------------------------------------------------------------------------- static const char* methodName = "computeCudaAmoebaLabFrameMoments"; // --------------------------------------------------------------------------------------- gpuContext gpu = amoebaGpu->gpuContext; int numBlocks = amoebaGpu->paddedNumberOfAtoms; int numThreads = 20; //#define AMOEBA_DEBUG #ifdef AMOEBA_DEBUG if( 0 && amoebaGpu->log ){ (void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log ); amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download(); amoebaGpu->psMolecularDipole->Download(); gpu->psPosq4->Download(); for( int ii = 0; ii < gpu->natoms; ii++ ){ int mIndex = 3*ii; (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d] x[%16.9e %16.9e %16.9e] dpl[%16.9e %16.9e %16.9e]\nRot[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n\n", ii, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w, gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y, gpu->psPosq4->_pSysStream[0][ii].z, amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex], amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex+1], amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex+2] ); } } // int64 kernelTime = AmoebaTiming::getTimeOfDay(); double kernelTime = 0.0; #endif kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( gpu->natoms, amoebaGpu->psRotationMatrix->_pDevStream[0], gpu->psPosq4->_pDevStream[0], amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pDevStream[0], amoebaGpu->psMolecularDipole->_pDevStream[0], amoebaGpu->psMolecularQuadrupole->_pDevStream[0], amoebaGpu->psLabFrameDipole->_pDevStream[0], amoebaGpu->psLabFrameQuadrupole->_pDevStream[0] ); LAUNCHERROR(methodName); #ifdef AMOEBA_DEBUG if( amoebaGpu->log ){ // kernelTime = AmoebaTiming::getTimeOfDay() - kernelTime; static int timestep = 0; timestep++; (void) fprintf( amoebaGpu->log, "Finished rotation kernel execution in %lf us\n", kernelTime ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "psLabFrameDipole=%p _pSysStream=%p _pSysStream[0]=%p _pDevStream=%p _pDevStream[0]=%p\n", amoebaGpu->psLabFrameDipole, amoebaGpu->psLabFrameDipole->_pSysStream, amoebaGpu->psLabFrameDipole->_pSysStream[0], amoebaGpu->psLabFrameDipole->_pDevStream, amoebaGpu->psLabFrameDipole->_pDevStream[0] ); fflush( amoebaGpu->log ); amoebaGpu->psRotationMatrix->Download(); amoebaGpu->psLabFrameDipole->Download(); (void) fprintf( amoebaGpu->log, "psLabFrameDipole completed\n" ); (void) fflush( amoebaGpu->log ); amoebaGpu->psLabFrameQuadrupole->Download(); (void) fprintf( amoebaGpu->log, "psLabFrameQpole completed\n" ); (void) fflush( amoebaGpu->log ); int maxPrint = 10; for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){ int dipoleOffset = 3*ii; int quadrupoleOffset = 9*ii; (void) fprintf( amoebaGpu->log,"\n%6d [%6d %6d %6d] ", ii, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y, amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w ); // coords (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e]\n", gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y, gpu->psPosq4->_pSysStream[0][ii].z); (void) fprintf( amoebaGpu->log," R[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+1], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+2], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+3], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+4], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+5], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+6], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+7], amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+8] ); // dipole (void) fprintf( amoebaGpu->log," D[%16.9e %16.9e %16.9e]\n", amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset], amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset+1], amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset+2] ); // quadrupole (void) fprintf( amoebaGpu->log," Q[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+1], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+2], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+3], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+4], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+5], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+6], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+7], amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+8] ); if( ii == maxPrint && (ii < (gpu->natoms - maxPrint)) ){ ii = gpu->natoms - maxPrint; } } int nansDetected = checkForNansAndInfinities( amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->psLabFrameDipole ); nansDetected += checkForNansAndInfinities( amoebaGpu->paddedNumberOfAtoms*9, amoebaGpu->psLabFrameQuadrupole ); if( nansDetected ){ (void) fprintf( amoebaGpu->log,"Nans detected in dipole/quadrupoles.\n" ); exit(0); } (void) fflush( amoebaGpu->log ); } #endif if( 0 ){ // int particles = particles; int particles = amoebaGpu->paddedNumberOfAtoms; std::vector fileId; //fileId.push_back( 0 ); VectorOfDoubleVectors outputVector; cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psRotationMatrix, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaRotationMatrices", fileId, outputVector ); } if( 0 ){ int particles = amoebaGpu->paddedNumberOfAtoms; std::vector fileId; //fileId.push_back( 0 ); VectorOfDoubleVectors outputVector; cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloatArray( particles, 3, amoebaGpu->psLabFrameDipole, outputVector ); cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psLabFrameQuadrupole, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaRotatedMoments", fileId, outputVector ); } } void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) { std::string methodName = "kCalculateAmoebaMultipoleForces"; //printf("%s \n", methodName.c_str() ); fflush( stdout ); // compute lab frame moments cudaComputeAmoebaLabFrameMoments( amoebaGpu ); // compute fixed E-field and mutual induced field if( hasAmoebaGeneralizedKirkwood ){ cudaComputeAmoebaFixedEAndGkFields( amoebaGpu ); cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu ); } else { if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){ cudaComputeAmoebaFixedEField( amoebaGpu ); cudaComputeAmoebaMutualInducedField( amoebaGpu ); } else { cudaComputeAmoebaPmeFixedEField( amoebaGpu ); cudaComputeAmoebaPmeMutualInducedField( amoebaGpu ); } } // check if induce dipole calculation converged -- abort if it did not if( amoebaGpu->mutualInducedDone == 0 ){ (void) fprintf( amoebaGpu->log, "%s induced dipole calculation did not converge -- aborting!\n", methodName.c_str() ); (void) fflush( amoebaGpu->log ); exit(-1); } // calculate electrostatic forces if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){ cudaComputeAmoebaElectrostatic( amoebaGpu ); } else { cudaComputeAmoebaPmeElectrostatic( amoebaGpu ); } // map torques to forces cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, amoebaGpu->gpuContext->psForce4 ); if( 0 && amoebaGpu->log ){ (void) fprintf( amoebaGpu->log, "Done mapping torques -> forces%s\n", methodName.c_str() ); fflush( NULL ); (void) fflush( NULL ); } } #undef AMOEBA_DEBUG