//----------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------- #include "amoebaCudaKernels.h" #include "kCalculateAmoebaCudaUtilities.h" //#define AMOEBA_DEBUG static __constant__ cudaGmxSimulation cSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed"); status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed"); } void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu) { cudaError_t status; gpuContext gpu = amoebaGpu->gpuContext; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed"); status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation)); RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 130) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal, float* fieldIn, float* fieldOut ) { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; // Reduce field const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; //const float term = 0.0f; while (pos < fieldComponents) { // self-term included here float totalField = EFieldReciprocal[pos] + term*cAmoebaSim.pLabFrameDipole[pos]; float* pFt = fieldIn + pos; unsigned int i = outputBuffers; while (i >= 4) { totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents]; pFt += fieldComponents*4; i -= 4; } if (i >= 2) { totalField += pFt[0] + pFt[fieldComponents]; pFt += fieldComponents*2; i -= 2; } if (i > 0) { totalField += pFt[0]; } fieldOut[pos] = totalField; pos += gridDim.x * blockDim.x; } } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 130) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut ) { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; // Reduce field const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; //const float term = 0.0f; while (pos < fieldComponents) { // self-term included here float totalField = term*cAmoebaSim.pLabFrameDipole[pos]; float* pFt = fieldIn + pos; unsigned int i = outputBuffers; while (i >= 4) { totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents]; pFt += fieldComponents*4; i -= 4; } if (i >= 2) { totalField += pFt[0] + pFt[fieldComponents]; pFt += fieldComponents*2; i -= 2; } if (i > 0) { totalField += pFt[0]; } fieldOut[pos] += totalField; pos += gridDim.x * blockDim.x; } } // reduce psWorkArray_3_1 -> EField // reduce psWorkArray_3_2 -> EFieldPolar static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu ) { // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self kReducePmeEFieldPolar_kernel<<nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, amoebaGpu->psE_Field->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0] ); LAUNCHERROR("kReducePmeE_Fields1"); // E_Field = E_Field (reciprocal) + E_Field (direct) + self kReducePmeEField_kernel<<nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psE_Field->_pDevStream[0] ); LAUNCHERROR("kReducePmeE_Fields2"); } // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field #undef GK #include "kCalculateAmoebaCudaFixedFieldParticle.h" __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ, float dscale, float pscale, float fields[4][3] #ifdef AMOEBA_DEBUG , float4* pullBack #endif ){ // compute the real space portion of the Ewald summation float xr = atomJ.x - atomI.x; float yr = atomJ.y - atomI.y; float zr = atomJ.z - atomI.z; // periodic boundary conditions xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX; yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; float r2 = xr*xr + yr* yr + zr*zr; float r = sqrtf(r2); // calculate the error function damping terms float ralpha = cSim.alphaEwald*r; float bn[4]; bn[0] = erfc(ralpha)/r; float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald; float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald); float exp2a = exp(-(ralpha*ralpha)); alsq2n *= alsq2; bn[1] = (bn[0]+alsq2n*exp2a)/r2; alsq2n *= alsq2; bn[2] = (3.0f*bn[1]+alsq2n*exp2a)/r2; alsq2n *= alsq2; bn[3] = (5.0f*bn[2]+alsq2n*exp2a)/r2; // compute the error function scaled and unscaled terms float scale3 = 1.0f; float scale5 = 1.0f; float scale7 = 1.0f; float damp = atomI.damp*atomJ.damp; if( damp != 0.0f ){ float ratio = (r/damp); ratio = ratio*ratio*ratio; float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole; damp = -pgamma*ratio; if( damp > -50.0f) { float expdamp = exp(damp); scale3 = 1.0f - expdamp; scale5 = 1.0f - expdamp*(1.0f-damp); scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp)); } } float dsc3 = dscale*scale3; float dsc5 = dscale*scale5; float dsc7 = dscale*scale7; float psc3 = pscale*scale3; float psc5 = pscale*scale5; float psc7 = pscale*scale7; float r3 = (r*r2); float r5 = (r3*r2); float r7 = (r5*r2); float drr3 = (1.0f-dsc3)/r3; float drr5 = 3.0f * (1.0f-dsc5)/r5; float drr7 = 15.0f * (1.0f-dsc7)/r7; float prr3 = (1.0f-psc3) / r3; float prr5 = 3.0f *(1.0f-psc5)/r5; float prr7 = 15.0f*(1.0f-psc7)/r7; float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr; float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr; float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr; float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr; float qir = qix*xr + qiy*yr + qiz*zr; float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr; float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr; float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr; float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr; float qkr = qkx*xr + qky*yr + qkz*zr; float fim[3],fkm[3]; float fid[3],fkd[3]; float fip[3],fkp[3]; fim[0] = -xr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr) - bn[1]*atomJ.labFrameDipole_X + 2.0f*bn[2]*qkx; fim[1] = -yr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr) - bn[1]*atomJ.labFrameDipole_Y + 2.0f*bn[2]*qky; fim[2] = -zr*(bn[1]*atomJ.q-bn[2]*dkr+bn[3]*qkr) - bn[1]*atomJ.labFrameDipole_Z + 2.0f*bn[2]*qkz; fkm[0] = xr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir) - bn[1]*atomI.labFrameDipole_X - 2.0f*bn[2]*qix; fkm[1] = yr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir) - bn[1]*atomI.labFrameDipole_Y - 2.0f*bn[2]*qiy; fkm[2] = zr*(bn[1]*atomI.q+bn[2]*dir+bn[3]*qir) - bn[1]*atomI.labFrameDipole_Z - 2.0f*bn[2]*qiz; fid[0] = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx; fid[1] = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky; fid[2] = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz; fkd[0] = xr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix; fkd[1] = yr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy; fkd[2] = zr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz; fip[0] = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx; fip[1] = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky; fip[2] = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz; fkp[0] = xr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix; fkp[1] = yr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy; fkp[2] = zr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz; // increment the field at each site due to this interaction if( r2 <= cAmoebaSim.cutoffDistance2 ){ fields[0][0] = fim[0] - fid[0]; fields[0][1] = fim[1] - fid[1]; fields[0][2] = fim[2] - fid[2]; fields[1][0] = fkm[0] - fkd[0]; fields[1][1] = fkm[1] - fkd[1]; fields[1][2] = fkm[2] - fkd[2]; fields[2][0] = fim[0] - fip[0]; fields[2][1] = fim[1] - fip[1]; fields[2][2] = fim[2] - fip[2]; fields[3][0] = fkm[0] - fkp[0]; fields[3][1] = fkm[1] - fkp[1]; fields[3][2] = fkm[2] - fkp[2]; } else { fields[0][0] = 0.0f; fields[1][0] = 0.0f; fields[2][0] = 0.0f; fields[3][0] = 0.0f; fields[0][1] = 0.0f; fields[1][1] = 0.0f; fields[2][1] = 0.0f; fields[3][1] = 0.0f; fields[0][2] = 0.0f; fields[1][2] = 0.0f; fields[2][2] = 0.0f; fields[3][2] = 0.0f; } #ifdef AMOEBA_DEBUG pullBack[0].x = xr; pullBack[0].y = yr; pullBack[0].z = zr; pullBack[0].w = r2; pullBack[1].x = atomJ.x - atomI.x; pullBack[1].y = atomJ.y - atomI.y; pullBack[1].z = atomJ.z - atomI.z; pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z); pullBack[2].x = scale3; pullBack[2].y = scale5; pullBack[2].z = scale7; pullBack[2].w = -1.0; #endif } // Include versions of the kernels for N^2 calculations. #define METHOD_NAME(a, b) a##N2##b #include "kCalculateAmoebaCudaPmeFixedEField.h" #define USE_OUTPUT_BUFFER_PER_WARP #undef METHOD_NAME #define METHOD_NAME(a, b) a##N2ByWarp##b #include "kCalculateAmoebaCudaPmeFixedEField.h" /**--------------------------------------------------------------------------------------- Report whether a number is a nan or infinity @param number number to test @return 1 if number is nan or infinity; else return 0 --------------------------------------------------------------------------------------- */ #ifdef AMOEBA_DEBUG static int isNanOrInfinity( double number ){ return (number != number || number == std::numeric_limits::infinity() || number == -std::numeric_limits::infinity()) ? 1 : 0; } #endif /**--------------------------------------------------------------------------------------- Compute fixed electric field using PME @param amoebaGpu amoebaGpu context --------------------------------------------------------------------------------------- */ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) { gpuContext gpu = amoebaGpu->gpuContext; #ifdef AMOEBA_DEBUG static const char* methodName = "computeCudaAmoebaPmeFixedEField"; if( amoebaGpu->log ){ (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log ); } int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; // N2 debug array CUDAStream* debugArray = new CUDAStream(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray"); memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms); debugArray->Upload(); // print intermediate results for the targetAtom unsigned int targetAtom = 0; int maxPrint = 3002; amoebaGpu->psE_Field->Download(); (void) fprintf( amoebaGpu->log, "Recip EFields In\n" ); for( int ii = 0; ii < gpu->natoms; ii++ ){ (void) fprintf( amoebaGpu->log, "%5d ", ii); int indexOffset = ii*3; // E_Field int isNan = isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset] ); isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1] ); isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] ); (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] %s\n", amoebaGpu->psE_Field->_pSysStream[0][indexOffset], amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1], amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2], (isNan ? "XXX" :"") ); if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){ ii = gpu->natoms - maxPrint; } } (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Recip EFields End\n" ); #endif kClearFields_3( amoebaGpu, 2 ); if (gpu->bOutputBufferPerWarp){ kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0], #ifdef AMOEBA_DEBUG amoebaGpu->psWorkArray_3_2->_pDevStream[0], debugArray->_pDevStream[0], targetAtom ); #else amoebaGpu->psWorkArray_3_2->_pDevStream[0] ); #endif } else { kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0], #ifdef AMOEBA_DEBUG amoebaGpu->psWorkArray_3_2->_pDevStream[0], debugArray->_pDevStream[0], targetAtom ); #else amoebaGpu->psWorkArray_3_2->_pDevStream[0] ); #endif } LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel"); #if 0 for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){ //float index = 1.0f; float index = (float) ii; for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){ unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj; amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index; amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index; amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index; } } amoebaGpu->psWorkArray_3_1->Upload(); #endif kReducePmeDirectE_Fields( amoebaGpu ); #ifdef AMOEBA_DEBUG if( amoebaGpu->log ){ gpu->psInteractionCount->Download(); (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n", amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp ); (void) fflush( amoebaGpu->log ); /* (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n", amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers ); amoebaGpu->psWorkArray_3_1->Download(); amoebaGpu->psWorkArray_3_2->Download(); for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){ (void) fprintf( amoebaGpu->log, "%5d ", ii); int indexOffset = ii*3; // buffer 1 (void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ", amoebaGpu->psWorkArray_3_1->_pSysStream[0][indexOffset], amoebaGpu->psWorkArray_3_1->_pSysStream[0][indexOffset+1], amoebaGpu->psWorkArray_3_1->_pSysStream[0][indexOffset+2] ); // buffer 2 (void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ", amoebaGpu->psWorkArray_3_2->_pSysStream[0][indexOffset], amoebaGpu->psWorkArray_3_2->_pSysStream[0][indexOffset+1], amoebaGpu->psWorkArray_3_2->_pSysStream[0][indexOffset+2] ); (void) fprintf( amoebaGpu->log,"\n" ); if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){ ii = gpu->natoms - maxPrint; } } (void) fflush( amoebaGpu->log ); */ amoebaGpu->psE_Field->Download(); amoebaGpu->psE_FieldPolar->Download(); for( int ii = 0; ii < gpu->natoms; ii++ ){ (void) fprintf( amoebaGpu->log, "%5d ", ii); int indexOffset = ii*3; // E_Field (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ", amoebaGpu->psE_Field->_pSysStream[0][indexOffset], amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1], amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] ); // E_Field polar (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ", amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset], amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+1], amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+2] ); (void) fprintf( amoebaGpu->log,"\n" ); if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){ ii = gpu->natoms - maxPrint; } } (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "EFields End\n" ); (void) fprintf( amoebaGpu->log, "DebugQ\n" ); debugArray->Download(); int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; for( int jj = 0; jj < gpu->natoms; jj++ ){ int debugIndex = jj; (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj ); for( int kk = 0; kk < 10; kk++ ){ (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n", debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w ); debugIndex += paddedNumberOfAtoms; } (void) fprintf( amoebaGpu->log,"\n" ); } // write results to file if( 1 ){ std::vector fileId; //fileId.push_back( 0 ); VectorOfDoubleVectors outputVector; cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); } delete debugArray; } #endif if( 0 ){ std::vector fileId; fileId.push_back( 0 ); VectorOfDoubleVectors outputVector; cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); } } void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu ) { kCalculateAmoebaPMEFixedMultipoles( amoebaGpu ); cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu ); }