Commit 0fe5c905 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

PME fixes

Converted CUDAStream SysStream[0] and DevStream[0] references to SysData & Devdata
parent ffcf55d9
......@@ -990,9 +990,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
if( pmeParametersSetBasedOnEwaldErrorTolerance ){
(void) fprintf( data.getLog(), " parameters set based on error tolerance and OpenMM algorithm.\n" );
} else {
double alphaT;
int xsizeT, ysizeT, zsizeT;
NonbondedForceImpl::calcPMEParameters(system, nb, alphaT, xsizeT, ysizeT, zsizeT);
double impliedTolerance = alpha*force.getCutoffDistance();
impliedTolerance = 0.5*exp( -(impliedTolerance*impliedTolerance) );
(void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e\n", impliedTolerance );
(void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e;", impliedTolerance );
(void) fprintf( data.getLog(), "OpenMM param: aEwald=%12.3f [%6d %6d %6d]\n", alphaT, xsizeT, ysizeT, zsizeT);
}
(void) fflush( data.getLog() );
}
......
......@@ -689,12 +689,12 @@ static void printElectrostaticBuffer( amoebaGpuContext amoebaGpu, unsigned int b
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
/*
......@@ -702,14 +702,14 @@ static void printElectrostaticBuffer( amoebaGpuContext amoebaGpu, unsigned int b
stop = -146016;
float maxV = -1.0e+99;
for( unsigned int ii = start; ii < stop; ii += 3 ){
if( amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){
if( amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
}
}
*/
......@@ -722,12 +722,12 @@ static void printElectrostaticAtomBuffers( amoebaGpuContext amoebaGpu, unsigned
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
......@@ -771,7 +771,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
#endif
......@@ -871,34 +871,34 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// force
(void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psForce->_pSysData[indexOffset],
amoebaGpu->psForce->_pSysData[indexOffset+1],
amoebaGpu->psForce->_pSysData[indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysStream[0][ii].x,
gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z);
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysStream[0][jj].x - gpu->psPosq4->_pSysStream[0][ii].x;
float yy = gpu->psPosq4->_pSysStream[0][jj].y - gpu->psPosq4->_pSysStream[0][ii].y;
float zz = gpu->psPosq4->_pSysStream[0][jj].z - gpu->psPosq4->_pSysStream[0][ii].z;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
......@@ -914,8 +914,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
int debugIndex = jj;
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
......@@ -933,12 +933,12 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psForce->_pSysStream[0][offset],
amoebaGpu->psForce->_pSysStream[0][offset+1],
amoebaGpu->psForce->_pSysStream[0][offset+2],
amoebaGpu->psTorque->_pSysStream[0][offset],
amoebaGpu->psTorque->_pSysStream[0][offset+1],
amoebaGpu->psTorque->_pSysStream[0][offset+2] );
amoebaGpu->psForce->_pSysData[offset],
amoebaGpu->psForce->_pSysData[offset+1],
amoebaGpu->psForce->_pSysData[offset+2],
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
......
......@@ -40,17 +40,17 @@ static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psE_Field->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields2");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevStream[0], amoebaGpu->psGk_Field->_pDevStream[0] );
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields3");
}
......@@ -323,12 +323,12 @@ static void printEFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIn
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
}
......@@ -339,12 +339,12 @@ static void printEFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int tar
unsigned int particleIndex = targetAtom + ii*3*amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
......@@ -383,7 +383,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
......@@ -413,18 +413,18 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
(void) fprintf( amoebaGpu->log, "N2 warp\n" );
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
gpu->psBornRadii->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
gpu->psBornRadii->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_3->_pDevStream[0] );
amoebaGpu->psWorkArray_3_3->_pDevData );
#endif
} else {
......@@ -437,18 +437,18 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#endif
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
gpu->psBornRadii->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
gpu->psBornRadii->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_3->_pDevStream[0] );
amoebaGpu->psWorkArray_3_3->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
......@@ -459,9 +459,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
......@@ -497,23 +497,23 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// E_Field
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
amoebaGpu->psE_Field->_pSysData[indexOffset],
amoebaGpu->psE_Field->_pSysData[indexOffset+1],
amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
// E_Field polar
(void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset],
amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+2] );
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
// Gk_Field polar
(void) fprintf( amoebaGpu->log,"Gk[%16.9e %16.9e %16.9e] ",
amoebaGpu->psGk_Field->_pSysStream[0][indexOffset],
amoebaGpu->psGk_Field->_pSysStream[0][indexOffset+1],
amoebaGpu->psGk_Field->_pSysStream[0][indexOffset+2] );
amoebaGpu->psGk_Field->_pSysData[indexOffset],
amoebaGpu->psGk_Field->_pSysData[indexOffset+1],
amoebaGpu->psGk_Field->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
......@@ -542,14 +542,14 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
ii, jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w,
gpu->psBornRadii->_pSysStream[0][jj] );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w,
gpu->psBornRadii->_pSysData[jj] );
for( int kk = 0; kk < 2; kk++ ){
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
(void) fprintf( amoebaGpu->log,"\n" );
}
......
......@@ -38,12 +38,12 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psE_Field->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReduceE_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReduceE_Fields2");
}
......@@ -88,7 +88,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
// N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
......@@ -104,13 +104,13 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
(void) fprintf( amoebaGpu->log, "N2 warp\n" );
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
......@@ -123,13 +123,13 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#endif
kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
}
......@@ -141,9 +141,9 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
......@@ -172,16 +172,16 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
// E_Field
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
amoebaGpu->psE_Field->_pSysData[indexOffset],
amoebaGpu->psE_Field->_pSysData[indexOffset+1],
amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
// E_Field polar
(void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset],
amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysStream[0][indexOffset+2] );
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
......@@ -211,65 +211,65 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
if( jj == ii )continue;
(void) fprintf( amoebaGpu->log,"\n\n%4d %4d rrs\n[%16.9e %16.9e %16.9e %16.9e]\n",
ii, jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"Y1 %5d %16.9e %16.9e %16.9e\n", jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
sum[0][0] += debugArray->_pSysStream[0][debugIndex].x;
sum[0][1] += debugArray->_pSysStream[0][debugIndex].y;
sum[0][2] += debugArray->_pSysStream[0][debugIndex].z;
sum[0][0] += debugArray->_pSysData[debugIndex].x;
sum[0][1] += debugArray->_pSysData[debugIndex].y;
sum[0][2] += debugArray->_pSysData[debugIndex].z;
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"Y2 %5d %16.9e %16.9e %16.9e\n", jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
sum[1][0] += debugArray->_pSysStream[0][debugIndex].x;
sum[1][1] += debugArray->_pSysStream[0][debugIndex].y;
sum[1][2] += debugArray->_pSysStream[0][debugIndex].z;
sum[1][0] += debugArray->_pSysData[debugIndex].x;
sum[1][1] += debugArray->_pSysData[debugIndex].y;
sum[1][2] += debugArray->_pSysData[debugIndex].z;
/*
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += gpu->natoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
*/
}
(void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
......@@ -278,12 +278,12 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
}
for( unsigned int ii = 0; ii < debugArray->_stride; ii++ ){
int print;
if( debugArray->_pSysStream[0][ii].x != 0.0f || debugArray->_pSysStream[0][ii].y != 0.0f ||
debugArray->_pSysStream[0][ii].y != 0.0f || debugArray->_pSysStream[0][ii].w != 0.0f ||
debugArray->_pSysStream[0][ii].x != debugArray->_pSysStream[0][ii].x ||
debugArray->_pSysStream[0][ii].y != debugArray->_pSysStream[0][ii].y ||
debugArray->_pSysStream[0][ii].z != debugArray->_pSysStream[0][ii].z ||
debugArray->_pSysStream[0][ii].w != debugArray->_pSysStream[0][ii].w ){
if( debugArray->_pSysData[ii].x != 0.0f || debugArray->_pSysData[ii].y != 0.0f ||
debugArray->_pSysData[ii].y != 0.0f || debugArray->_pSysData[ii].w != 0.0f ||
debugArray->_pSysData[ii].x != debugArray->_pSysData[ii].x ||
debugArray->_pSysData[ii].y != debugArray->_pSysData[ii].y ||
debugArray->_pSysData[ii].z != debugArray->_pSysData[ii].z ||
debugArray->_pSysData[ii].w != debugArray->_pSysData[ii].w ){
print = 0;
} else {
print = 0;
......@@ -293,10 +293,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
unsigned int atomJ = ii - atomI*amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log, "%5u [%5u %5u] ", ii, atomI, atomJ);
(void) fprintf( amoebaGpu->log, "%14.6e %14.6e %14.6e %14.6e\n",
debugArray->_pSysStream[0][ii].x,
debugArray->_pSysStream[0][ii].y,
debugArray->_pSysStream[0][ii].z,
debugArray->_pSysStream[0][ii].w );
debugArray->_pSysData[ii].x,
debugArray->_pSysData[ii].y,
debugArray->_pSysData[ii].z,
debugArray->_pSysData[ii].w );
}
}
......
......@@ -1542,12 +1542,12 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psKirkwoodForce->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodForce->_pDevData );
LAUNCHERROR("kReduceForceTorque1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceForceTorque2");
}
......@@ -1561,13 +1561,13 @@ static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
/*
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevStream[0], amoebaGpu->psBorn->_pDevStream[0] );
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData );
LAUNCHERROR("kReduce_dBorn1");
*/
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevStream[0], amoebaGpu->psBornPolar->_pDevStream[0] );
amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData );
LAUNCHERROR("kReduce_dBorn2");
}
......@@ -1709,9 +1709,9 @@ static void kReduceAndCombine_dBorn(amoebaGpuContext amoebaGpu )
kReduceAndCombineFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevStream[0],
amoebaGpu->psWorkArray_1_2->_pDevStream[0],
amoebaGpu->psBorn->_pDevStream[0] );
amoebaGpu->psWorkArray_1_1->_pDevData,
amoebaGpu->psWorkArray_1_2->_pDevData,
amoebaGpu->psBorn->_pDevData );
LAUNCHERROR("kReduce_dBorn");
} */
......@@ -1722,9 +1722,9 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
kReduceToBornForcePrefactorAndSASA_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevStream[0],
amoebaGpu->psWorkArray_1_2->_pDevStream[0],
amoebaGpu->gpuContext->psBornForce->_pDevStream[0] );
amoebaGpu->psWorkArray_1_1->_pDevData,
amoebaGpu->psWorkArray_1_2->_pDevData,
amoebaGpu->gpuContext->psBornForce->_pDevData );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
......@@ -1739,9 +1739,9 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
(void) fprintf( amoebaGpu->log,"bF %16.9e obc=%16.9e bR=%16.9e\n",
amoebaGpu->gpuContext->psBornForce->_pSysStream[0][ii],
amoebaGpu->gpuContext->psObcData->_pSysStream[0][ii].x,
amoebaGpu->gpuContext->psBornRadii->_pSysStream[0][ii] );
amoebaGpu->gpuContext->psBornForce->_pSysData[ii],
amoebaGpu->gpuContext->psObcData->_pSysData[ii].x,
amoebaGpu->gpuContext->psBornRadii->_pSysData[ii] );
}
(void) fflush( amoebaGpu->log );
if( 1 ){
......@@ -1764,9 +1764,9 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
} else {
kReduceToBornForcePrefactor_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevStream[0],
amoebaGpu->psWorkArray_1_2->_pDevStream[0],
amoebaGpu->gpuContext->psBornForce->_pDevStream[0] );
amoebaGpu->psWorkArray_1_1->_pDevData,
amoebaGpu->psWorkArray_1_2->_pDevData,
amoebaGpu->gpuContext->psBornForce->_pDevData );
}
LAUNCHERROR("kReduceToBornForcePrefactor");
}
......@@ -1784,12 +1784,12 @@ static void printKirkwoodBuffer( amoebaGpuContext amoebaGpu, unsigned int buffer
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
/*
......@@ -1797,14 +1797,14 @@ static void printKirkwoodBuffer( amoebaGpuContext amoebaGpu, unsigned int buffer
stop = -146016;
float maxV = -1.0e+99;
for( unsigned int ii = start; ii < stop; ii += 3 ){
if( amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){
if( amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
}
}
*/
......@@ -1817,12 +1817,12 @@ static void printKirkwoodAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int t
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
......@@ -1867,7 +1867,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
......@@ -1875,7 +1875,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "Kirkwood input\n" ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
gpu->psBornRadii->_pSysStream[0][ii] );
gpu->psBornRadii->_pSysData[ii] );
}
#endif
......@@ -1908,9 +1908,9 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0]
amoebaGpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
, debugArray->_pDevStream[0], targetAtom );
, debugArray->_pDevData, targetAtom );
#else
);
#endif
......@@ -1925,9 +1925,9 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
#endif
kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0]
amoebaGpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
, debugArray->_pDevStream[0], targetAtom );
, debugArray->_pDevData, targetAtom );
#else
);
#endif
......@@ -1952,19 +1952,19 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
int indexOffset3 = ii*3;
int indexOffset9 = ii*9;
(void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
amoebaGpu->psLabFrameDipole->_pSysStream[0][indexOffset3],
amoebaGpu->psLabFrameDipole->_pSysStream[0][indexOffset3+1],
amoebaGpu->psLabFrameDipole->_pSysStream[0][indexOffset3+2],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][indexOffset9],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][indexOffset9+1],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][indexOffset9+2] );
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3],
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+1],
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+2],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+1],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+2] );
(void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
amoebaGpu->psInducedDipoleS->_pSysStream[0][indexOffset3],
amoebaGpu->psInducedDipoleS->_pSysStream[0][indexOffset3+1],
amoebaGpu->psInducedDipoleS->_pSysStream[0][indexOffset3+2],
amoebaGpu->psInducedDipolePolarS->_pSysStream[0][indexOffset3],
amoebaGpu->psInducedDipolePolarS->_pSysStream[0][indexOffset3+1],
amoebaGpu->psInducedDipolePolarS->_pSysStream[0][indexOffset3+2] );
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3],
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+1],
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+2],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+1],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+2] );
}
*/
debugArray->Download();
......@@ -1978,8 +1978,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log,"%5d %5d DebugGk\n", targetAtom, jj );
for( int kk = 0; kk < 8; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
......@@ -2015,28 +2015,28 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
// force
(void) fprintf( amoebaGpu->log,"KirkwoodF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset],
amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psKirkwoodForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
// d_Born
//float bornForceValue = amoebaGpu->psBorn->_pSysStream[0][ii];
float bornForceValue = gpu->psBornForce->_pSysStream[0][ii];
float bornRadius = gpu->psBornRadii->_pSysStream[0][ii];
float obcChain = gpu->psObcChain->_pSysStream[0][ii];
//float bornForceValue = amoebaGpu->psBorn->_pSysData[ii];
float bornForceValue = gpu->psBornForce->_pSysData[ii];
float bornRadius = gpu->psBornRadii->_pSysData[ii];
float obcChain = gpu->psObcChain->_pSysData[ii];
float bornSumValue = bornRadius*obcChain != 0.0f ? bornForceValue/(bornRadius*bornRadius*obcChain) : 0.0f;
float bornValue = bornSumValue - amoebaGpu->psBornPolar->_pSysStream[0][ii];
float bornValue = bornSumValue - amoebaGpu->psBornPolar->_pSysData[ii];
(void) fprintf( amoebaGpu->log,"dB br=%16.9e obcC=%16.9e bSum=%16.9e [%16.9e %16.9e]",
bornRadius,obcChain, bornSumValue, bornValue,
amoebaGpu->psBornPolar->_pSysStream[0][ii] );
amoebaGpu->psBornPolar->_pSysData[ii] );
// coords
......@@ -2054,7 +2054,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
// d_Born
(void) fprintf( amoebaGpu->log,"dBrnSum %16.9e ",
amoebaGpu->psBorn->_pSysStream[0][ii] );
amoebaGpu->psBorn->_pSysData[ii] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
......@@ -2099,9 +2099,9 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
// force
(void) fprintf( amoebaGpu->log,"KirkwoodF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset],
amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psKirkwoodForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psKirkwoodForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
......
......@@ -936,12 +936,12 @@ static void kReduceForceTorque( amoebaGpuContext amoebaGpu )
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psKirkwoodEDiffForce->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodEDiffForce->_pDevData );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff2");
}
......@@ -959,12 +959,12 @@ static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int b
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
/*
......@@ -972,14 +972,14 @@ static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int b
stop = -146016;
float maxV = -1.0e+99;
for( unsigned int ii = start; ii < stop; ii += 3 ){
if( amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){
if( amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
}
}
*/
......@@ -992,12 +992,12 @@ static void printKirkwoodEDiffAtomBuffers( amoebaGpuContext amoebaGpu, unsigned
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
......@@ -1041,7 +1041,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
#endif
......@@ -1074,39 +1074,39 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psInducedDipoleS->_pDevStream[0],
amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData,
amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psInducedDipoleS->_pDevStream[0],
amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData,
amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
}
......@@ -1138,34 +1138,34 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
// force
(void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysStream[0][ii].x,
gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z);
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysStream[0][jj].x - gpu->psPosq4->_pSysStream[0][ii].x;
float yy = gpu->psPosq4->_pSysStream[0][jj].y - gpu->psPosq4->_pSysStream[0][ii].y;
float zz = gpu->psPosq4->_pSysStream[0][jj].z - gpu->psPosq4->_pSysStream[0][ii].z;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
......@@ -1177,8 +1177,8 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log,"%5d %5d ediff F%T\n", ii, jj );
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
......@@ -1203,12 +1203,12 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][offset],
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][offset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][offset+2],
amoebaGpu->psTorque->_pSysStream[0][offset],
amoebaGpu->psTorque->_pSysStream[0][offset+1],
amoebaGpu->psTorque->_pSysStream[0][offset+2] );
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+2],
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
......@@ -1248,9 +1248,9 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
// force
(void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
......
......@@ -715,13 +715,13 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
int indexOffset = ii*3;
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psForce->_pSysData[indexOffset],
amoebaGpu->psForce->_pSysData[indexOffset+1],
amoebaGpu->psForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
}
int nansDetected = checkForNansAndInfinities( gpu->natoms*3, amoebaGpu->psForce );
......@@ -735,7 +735,7 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
// zero forces
#if 0
for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
amoebaGpu->psForce->_pSysStream[0][ii] = 0.0f;
amoebaGpu->psForce->_pSysData[ii] = 0.0f;
}
amoebaGpu->psForce->Upload();
#endif
......@@ -746,15 +746,15 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
/*
AmoebaTorqueMapZeroKernel<<< numBlocks, numThreads >>>(
gpu->natoms, amoebaGpu->torqueMapForce->_pDevStream[0] );
gpu->natoms, amoebaGpu->torqueMapForce->_pDevData );
LAUNCHERROR("AmoebaMapTrqZeroKernel");
*/
amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
psTorque->_pDevStream[0],
psTorque->_pDevData,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0] );
amoebaGpu->torqueMapForce->_pDevData );
LAUNCHERROR("AmoebaMapTrqKernel");
//#ifdef AMOEBA_DEBUG
......@@ -764,24 +764,24 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
(void) fprintf( amoebaGpu->log,"Post AmoebaMapTrqKernel maxMapTorqueDifference=%d\n", amoebaGpu->maxMapTorqueDifference );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "\n%5d multi[%d %d %d %d] offset=%d\n", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w,
amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii] );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
amoebaGpu->psMultipoleAxisOffset->_pSysData[ii] );
int indexOffset = ii*3*amoebaGpu->maxMapTorqueDifference;
float sum[3] = { 0.0f, 0.0f, 0.0f };
for( int jj = 0; jj < amoebaGpu->maxMapTorqueDifference; jj++ ){
if( amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset] != 0.0f ){
if( amoebaGpu->torqueMapForce->_pSysData[indexOffset] != 0.0f ){
(void) fprintf( amoebaGpu->log," %4d %4d Temp[%16.9e %16.9e %16.9e] %d\n",
ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2], indexOffset );
sum[0] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset];
sum[1] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1];
sum[2] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2];
ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysData[ii],
amoebaGpu->torqueMapForce->_pSysData[indexOffset],
amoebaGpu->torqueMapForce->_pSysData[indexOffset+1],
amoebaGpu->torqueMapForce->_pSysData[indexOffset+2], indexOffset );
sum[0] += amoebaGpu->torqueMapForce->_pSysData[indexOffset];
sum[1] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+1];
sum[2] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+2];
}
indexOffset += 3;
}
......@@ -796,8 +796,8 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
amoebaMapTorqueReduce_kernel<<< numBlocks, numThreads>>>(
numThreads, gpu->natoms,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0],
psForce->_pDevStream[0] );
amoebaGpu->torqueMapForce->_pDevData,
psForce->_pDevData );
LAUNCHERROR("amoebaMapTorqueReduce_kernel");
#ifdef AMOEBA_DEBUG
......@@ -813,13 +813,13 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
int indexOffset = ii*3;
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psForce->_pSysData[indexOffset],
amoebaGpu->psForce->_pSysData[indexOffset+1],
amoebaGpu->psForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
}
(void) fflush( amoebaGpu->log );
......@@ -885,13 +885,13 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
int indexOffset = ii*3;
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
psForce->_pSysStream[0][indexOffset],
psForce->_pSysStream[0][indexOffset+1],
psForce->_pSysStream[0][indexOffset+2] );
psForce->_pSysData[indexOffset],
psForce->_pSysData[indexOffset+1],
psForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
psTorque->_pSysStream[0][indexOffset],
psTorque->_pSysStream[0][indexOffset+1],
psTorque->_pSysStream[0][indexOffset+2] );
psTorque->_pSysData[indexOffset],
psTorque->_pSysData[indexOffset+1],
psTorque->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
}
int nansDetected = checkForNansAndInfinities( gpu->natoms*3, psForce );
......@@ -905,7 +905,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
// zero forces
#if 0
for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
psForce->_pSysStream[0][ii] = 0.0f;
psForce->_pSysData[ii] = 0.0f;
}
psForce->Upload();
#endif
......@@ -914,18 +914,18 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(void) fprintf( amoebaGpu->log,"Setting force & torque values.\n" );
for( int ii = 0; ii < 3*gpu->natoms; ii += 3 ){
psTorque->_pSysStream[0][ii] = 1.0f;
psTorque->_pSysStream[0][ii+1] = 0.0f;
psTorque->_pSysStream[0][ii+2] = 0.0f;
psTorque->_pSysData[ii] = 1.0f;
psTorque->_pSysData[ii+1] = 0.0f;
psTorque->_pSysData[ii+2] = 0.0f;
psForce->_pSysStream[0][ii] = 0.0f;
psForce->_pSysStream[0][ii+1] = 0.0f;
psForce->_pSysStream[0][ii+2] = 0.0f;
psForce->_pSysData[ii] = 0.0f;
psForce->_pSysData[ii+1] = 0.0f;
psForce->_pSysData[ii+2] = 0.0f;
}
for( int ii = 0; ii < gpu->natoms; ii++ ){
psCudaForce4->_pSysStream[0][ii].x = 0.0f;
psCudaForce4->_pSysStream[0][ii].y = 0.0f;
psCudaForce4->_pSysStream[0][ii].z = 0.0f;
psCudaForce4->_pSysData[ii].x = 0.0f;
psCudaForce4->_pSysData[ii].y = 0.0f;
psCudaForce4->_pSysData[ii].z = 0.0f;
}
psForce->Upload();
psTorque->Upload();
......@@ -938,16 +938,16 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
/*
AmoebaTorqueMapZeroKernel<<< numBlocks, numThreads >>>(
gpu->natoms, amoebaGpu->torqueMapForce->_pDevStream[0] );
gpu->natoms, amoebaGpu->torqueMapForce->_pDevData );
LAUNCHERROR("AmoebaMapTrqZeroKernel");
*/
//amoebaMapTorqueToForceOld_kernel<<< numBlocks, numThreads>>> (
amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
psTorque->_pDevStream[0],
psTorque->_pDevData,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0] );
amoebaGpu->torqueMapForce->_pDevData );
LAUNCHERROR("AmoebaMapTrqKernel");
//#ifdef AMOEBA_DEBUG
......@@ -957,23 +957,23 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(void) fprintf( amoebaGpu->log,"Post AmoebaMapTrqKernel maxMapTorqueDifference=%d\n", amoebaGpu->maxMapTorqueDifference );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "\n%5d multi[%d %d %d %d]\n", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w, amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii] );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w, amoebaGpu->psMultipoleAxisOffset->_pSysData[ii] );
int indexOffset = ii*3*amoebaGpu->maxMapTorqueDifference;
float sum[3] = { 0.0f, 0.0f, 0.0f };
for( int jj = 0; jj < amoebaGpu->maxMapTorqueDifference; jj++ ){
if( amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset] != 0.0f ){
if( amoebaGpu->torqueMapForce->_pSysData[indexOffset] != 0.0f ){
(void) fprintf( amoebaGpu->log," %4d %4d Temp[%16.9e %16.9e %16.9e] %d\n",
ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysStream[0][ii],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2], indexOffset );
sum[0] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset];
sum[1] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1];
sum[2] += amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2];
ii, jj + amoebaGpu->psMultipoleAxisOffset->_pSysData[ii],
amoebaGpu->torqueMapForce->_pSysData[indexOffset],
amoebaGpu->torqueMapForce->_pSysData[indexOffset+1],
amoebaGpu->torqueMapForce->_pSysData[indexOffset+2], indexOffset );
sum[0] += amoebaGpu->torqueMapForce->_pSysData[indexOffset];
sum[1] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+1];
sum[2] += amoebaGpu->torqueMapForce->_pSysData[indexOffset+2];
}
indexOffset += 3;
}
......@@ -988,8 +988,8 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
amoebaMapTorqueReduce_kernel2<<< numBlocks, numThreads>>>(
numThreads, gpu->natoms,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0],
psForce->_pDevStream[0], psCudaForce4->_pDevStream[0] );
amoebaGpu->torqueMapForce->_pDevData,
psForce->_pDevData, psCudaForce4->_pDevData );
LAUNCHERROR("amoebaMapTorqueReduce_kernel2");
#ifdef AMOEBA_DEBUG
......@@ -1007,21 +1007,21 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
int indexOffset = ii*3;
(void) fprintf( amoebaGpu->log,"FTtl[%16.9e %16.9e %16.9e] ",
psCudaForce4->_pSysStream[0][ii].x,
psCudaForce4->_pSysStream[0][ii].y,
psCudaForce4->_pSysStream[0][ii].z );
psCudaForce4->_pSysData[ii].x,
psCudaForce4->_pSysData[ii].y,
psCudaForce4->_pSysData[ii].z );
(void) fprintf( amoebaGpu->log,"F[%16.9e %16.9e %16.9e] ",
psForce->_pSysStream[0][indexOffset],
psForce->_pSysStream[0][indexOffset+1],
psForce->_pSysStream[0][indexOffset+2] );
psForce->_pSysData[indexOffset],
psForce->_pSysData[indexOffset+1],
psForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->torqueMapForce->_pSysData[indexOffset],
amoebaGpu->torqueMapForce->_pSysData[indexOffset+1],
amoebaGpu->torqueMapForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
psTorque->_pSysStream[0][indexOffset],
psTorque->_pSysStream[0][indexOffset+1],
psTorque->_pSysStream[0][indexOffset+2] );
psTorque->_pSysData[indexOffset],
psTorque->_pSysData[indexOffset+1],
psTorque->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
}
(void) fflush( amoebaGpu->log );
......@@ -1073,9 +1073,9 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
int numBlocks = 1 + (gpu->natoms/numThreads);
amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> (
psTorque->_pDevStream[0],
psTorque->_pDevData,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0] );
amoebaGpu->torqueMapForce->_pDevData );
LAUNCHERROR("AmoebaMapTrqKernel");
numBlocks = gpu->natoms;
......@@ -1084,8 +1084,8 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
amoebaMapTorqueReduce_kernel3<<< numBlocks, numThreads>>>(
numThreads, gpu->natoms,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0],
psCudaForce4->_pDevStream[0] );
amoebaGpu->torqueMapForce->_pDevData,
psCudaForce4->_pDevData );
LAUNCHERROR("amoebaMapTorqueReduce_kernel3");
#ifdef AMOEBA_DEBUG
......@@ -1103,17 +1103,17 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
int indexOffset = ii*3;
(void) fprintf( amoebaGpu->log,"FTtl[%16.9e %16.9e %16.9e] ",
psCudaForce4->_pSysStream[0][ii].x,
psCudaForce4->_pSysStream[0][ii].y,
psCudaForce4->_pSysStream[0][ii].z );
psCudaForce4->_pSysData[ii].x,
psCudaForce4->_pSysData[ii].y,
psCudaForce4->_pSysData[ii].z );
(void) fprintf( amoebaGpu->log,"F[%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
amoebaGpu->psForce->_pSysData[indexOffset],
amoebaGpu->psForce->_pSysData[indexOffset+1],
amoebaGpu->psForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii )ii = gpu->natoms - maxPrint;
}
(void) fflush( amoebaGpu->log );
......
......@@ -219,12 +219,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceMI_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], outputPolarArray->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
LAUNCHERROR("kReduceMI_Fields2");
}
......@@ -252,7 +252,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
#endif
......@@ -260,13 +260,13 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
......@@ -280,13 +280,13 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
......@@ -314,34 +314,34 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
// MI
(void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
outputArray->_pSysStream[0][indexOffset],
outputArray->_pSysStream[0][indexOffset+1],
outputArray->_pSysStream[0][indexOffset+2] );
outputArray->_pSysData[indexOffset],
outputArray->_pSysData[indexOffset+1],
outputArray->_pSysData[indexOffset+2] );
// MI polar
(void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e] ",
outputPolarArray->_pSysStream[0][indexOffset],
outputPolarArray->_pSysStream[0][indexOffset+1],
outputPolarArray->_pSysStream[0][indexOffset+2] );
outputPolarArray->_pSysData[indexOffset],
outputPolarArray->_pSysData[indexOffset+1],
outputPolarArray->_pSysData[indexOffset+2] );
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysStream[0][ii].x,
gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z);
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysStream[0][jj].x - gpu->psPosq4->_pSysStream[0][ii].x;
float yy = gpu->psPosq4->_pSysStream[0][jj].y - gpu->psPosq4->_pSysStream[0][ii].y;
float zz = gpu->psPosq4->_pSysStream[0][jj].z - gpu->psPosq4->_pSysStream[0][ii].z;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
......@@ -357,18 +357,18 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Pint [%16.9e %16.9e %16.9e %16.9e] ",
ii, jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
//debugIndex += gpu->natoms;
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
int index = 0;
sums[index][0] += debugArray->_pSysStream[0][debugIndex].x;
sums[index][1] += debugArray->_pSysStream[0][debugIndex].y;
sums[index][2] += debugArray->_pSysStream[0][debugIndex].z;
sums[index][0] += debugArray->_pSysData[debugIndex].x;
sums[index][1] += debugArray->_pSysData[debugIndex].y;
sums[index][2] += debugArray->_pSysData[debugIndex].z;
if( count && ( (count % 31) == 0) ){
static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
......@@ -383,12 +383,12 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
index++;
sums[index][0] += debugArray->_pSysStream[0][debugIndex].x;
sums[index][1] += debugArray->_pSysStream[0][debugIndex].y;
sums[index][2] += debugArray->_pSysStream[0][debugIndex].z;
sums[index][0] += debugArray->_pSysData[debugIndex].x;
sums[index][1] += debugArray->_pSysData[debugIndex].y;
sums[index][2] += debugArray->_pSysData[debugIndex].z;
if( count && ( (count % 31) == 0) ){
static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
......@@ -475,11 +475,11 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
gpu->natoms,
amoebaGpu->psE_Field->_pDevStream[0],
amoebaGpu->psE_FieldPolar->_pDevStream[0],
amoebaGpu->psPolarizability->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0] );
amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData );
LAUNCHERROR("AmoebaMutualInducedFieldSetup");
#ifdef AMOEBA_DEBUG
......@@ -495,18 +495,18 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
int maxPrint = 20000;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii,
amoebaGpu->psPolarizability->_pSysStream[0][offset] );
if( amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+1] ||
amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+2] ){
(void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysStream[0][offset+1], amoebaGpu->psPolarizability->_pSysStream[0][offset+2] );
amoebaGpu->psPolarizability->_pSysData[offset] );
if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
(void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] );
}
(void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psE_Field->_pSysStream[0][offset], amoebaGpu->psE_Field->_pSysStream[0][offset+1], amoebaGpu->psE_Field->_pSysStream[0][offset+2],
amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
amoebaGpu->psE_Field->_pSysData[offset], amoebaGpu->psE_Field->_pSysData[offset+1], amoebaGpu->psE_Field->_pSysData[offset+2],
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_FieldPolar->_pSysStream[0][offset], amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+1], amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+2],
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
amoebaGpu->psE_FieldPolar->_pSysData[offset], amoebaGpu->psE_FieldPolar->_pSysData[offset+1], amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
offset += 3;
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii = (gpu->natoms - maxPrint);
}
......@@ -529,17 +529,17 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0], amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psE_Field->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0],
amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0] );
gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdateMutualInducedField");
// get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0],
amoebaGpu->psCurrentEpsilon->_pDevStream[0] );
3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReduceMutualInducedFieldDelta");
if( amoebaGpu->log ){
......@@ -548,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysStream[0][0];
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;
if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){
......@@ -569,9 +569,9 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = 3*(ii+1);
......
......@@ -6,8 +6,6 @@
#include "amoebaCudaKernels.h"
#include "bbsort.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -241,6 +239,9 @@ void kGridSpreadFixedMultipoles_kernel()
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
if( iz >= cSim.pmeGridSize.z ){
iz -= cSim.pmeGridSize.z;
}
float atomCharge = cSim.pPosq[atomIndex].w;
float atomDipoleX = xscale*cAmoebaSim.pLabFrameDipole[atomIndex*3];
float atomDipoleY = yscale*cAmoebaSim.pLabFrameDipole[atomIndex*3+1];
......@@ -271,6 +272,9 @@ void kGridSpreadFixedMultipoles_kernel()
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
if( iz >= cSim.pmeGridSize.z ){
iz -= cSim.pmeGridSize.z;
}
float atomCharge = cSim.pPosq[atomIndex].w;
float atomDipoleX = xscale*cAmoebaSim.pLabFrameDipole[atomIndex*3];
float atomDipoleY = yscale*cAmoebaSim.pLabFrameDipole[atomIndex*3+1];
......@@ -332,6 +336,9 @@ void kGridSpreadInducedDipoles_kernel()
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
if( iz >= cSim.pmeGridSize.z ){
iz -= cSim.pmeGridSize.z;
}
float inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
float inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
float inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
......@@ -360,6 +367,9 @@ void kGridSpreadInducedDipoles_kernel()
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
if( iz >= cSim.pmeGridSize.z ){
iz -= cSim.pmeGridSize.z;
}
float inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
float inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
float inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
......@@ -705,21 +715,23 @@ void kComputeInducedPotentialFromGrid_kernel()
cAmoebaSim.pPhid[10*m+1] = tuv100_1;
cAmoebaSim.pPhid[10*m+2] = tuv010_1;
cAmoebaSim.pPhid[10*m+3] = tuv001_1;
cAmoebaSim.pPhid[10*m+4] = tuv100_1;
cAmoebaSim.pPhid[10*m+5] = tuv010_1;
cAmoebaSim.pPhid[10*m+4] = tuv200_1;
cAmoebaSim.pPhid[10*m+5] = tuv020_1;
cAmoebaSim.pPhid[10*m+6] = tuv002_1;
cAmoebaSim.pPhid[10*m+7] = tuv110_1;
cAmoebaSim.pPhid[10*m+8] = tuv101_1;
cAmoebaSim.pPhid[10*m+9] = tuv011_1;
cAmoebaSim.pPhip[10*m+1] = tuv100_2;
cAmoebaSim.pPhip[10*m+2] = tuv010_2;
cAmoebaSim.pPhip[10*m+3] = tuv001_2;
cAmoebaSim.pPhip[10*m+4] = tuv100_2;
cAmoebaSim.pPhip[10*m+5] = tuv010_2;
cAmoebaSim.pPhip[10*m+4] = tuv200_2;
cAmoebaSim.pPhip[10*m+5] = tuv020_2;
cAmoebaSim.pPhip[10*m+6] = tuv002_2;
cAmoebaSim.pPhip[10*m+7] = tuv110_2;
cAmoebaSim.pPhip[10*m+8] = tuv101_2;
cAmoebaSim.pPhip[10*m+9] = tuv011_2;
cAmoebaSim.pPhidp[20*m] = tuv000;
cAmoebaSim.pPhidp[20*m+1] = tuv100;
cAmoebaSim.pPhidp[20*m+2] = tuv010;
......@@ -888,23 +900,40 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
float* phip = &cAmoebaSim.pPhip[10*i];
float* phid = &cAmoebaSim.pPhid[10*i];
float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for (int k = 0; k < 3; k++) {
int j1 = deriv1[k+1];
int j2 = deriv2[k+1];
int j3 = deriv3[k+1];
energy += inducedDipole[k]*phi[k+1];
f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1] + inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1];
f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2] + inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2];
f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3] + inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3];
}
f.x *= cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
f.y *= cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
for (int k = 0; k < 10; k++) {
f.x += multipole[k]*phidp[deriv1[k]];
f.y += multipole[k]*phidp[deriv2[k]];
f.z += multipole[k]*phidp[deriv3[k]];
}
f.x *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
/*
f.x *= 0.5f*cAmoebaSim.electric;
f.y *= 0.5f*cAmoebaSim.electric;
f.z *= 0.5f*cAmoebaSim.electric;
*/
float4 force = cSim.pForce4[i];
force.x -= f.x;
force.y -= f.y;
......@@ -1021,5 +1050,7 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
gpuContext gpu = amoebaGpu->gpuContext;
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
LAUNCHERROR("cudaComputeAmoebaMapTorquesAndAddTotalForce2_kCalculateAmoebaPMEInducedDipoleForces");
}
......@@ -1089,12 +1089,12 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
LAUNCHERROR("kReducePmeDirectElectrostaticForce");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
}
......@@ -1109,9 +1109,9 @@ static void zeroForce( amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
gpu->psForce4->_pSysStream[0][ii].x = 0.0f;
gpu->psForce4->_pSysStream[0][ii].y = 0.0f;
gpu->psForce4->_pSysStream[0][ii].z = 0.0f;
gpu->psForce4->_pSysData[ii].x = 0.0f;
gpu->psForce4->_pSysData[ii].y = 0.0f;
gpu->psForce4->_pSysData[ii].z = 0.0f;
}
gpu->psForce4->Upload();
}
......@@ -1129,10 +1129,10 @@ static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
gpu->psForce4->Download();
int indexOffset = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
amoebaGpu->psForce->_pSysStream[0][indexOffset] = conversion*(gpu->psForce4->_pSysStream[0][ii].x);
amoebaGpu->psForce->_pSysStream[0][indexOffset+1] = conversion*(gpu->psForce4->_pSysStream[0][ii].y);
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] = conversion*(gpu->psForce4->_pSysStream[0][ii].z);
indexOffset += 3;
amoebaGpu->psForce->_pSysData[indexOffset] = conversion*(gpu->psForce4->_pSysData[ii].x);
amoebaGpu->psForce->_pSysData[indexOffset+1] = conversion*(gpu->psForce4->_pSysData[ii].y);
amoebaGpu->psForce->_pSysData[indexOffset+2] = conversion*(gpu->psForce4->_pSysData[ii].z);
indexOffset += 3;
}
amoebaGpu->psForce->Upload();
}
......@@ -1179,8 +1179,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
int maxOffset = 20;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxOffset*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxOffset*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 49;
#endif
......@@ -1198,18 +1199,18 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
unsigned int lineTokenIndex = 1;
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
}
float conversion = 0.1f;
for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
amoebaGpu->psInducedDipole->_pSysStream[0][ii] *= conversion;
amoebaGpu->psInducedDipolePolar->_pSysStream[0][ii] *= conversion;
amoebaGpu->psInducedDipole->_pSysData[ii] *= conversion;
amoebaGpu->psInducedDipolePolar->_pSysData[ii] *= conversion;
}
amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
......@@ -1248,24 +1249,24 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
......@@ -1292,20 +1293,20 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// force
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset],
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
conversion*amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
conversion*amoebaGpu->psForce->_pSysData[indexOffset],
conversion*amoebaGpu->psForce->_pSysData[indexOffset+1],
conversion*amoebaGpu->psForce->_pSysData[indexOffset+2] );
forceSum[0] += amoebaGpu->psForce->_pSysStream[0][indexOffset];
forceSum[1] += amoebaGpu->psForce->_pSysStream[0][indexOffset+1];
forceSum[2] += amoebaGpu->psForce->_pSysStream[0][indexOffset+2];
forceSum[0] += amoebaGpu->psForce->_pSysData[indexOffset];
forceSum[1] += amoebaGpu->psForce->_pSysData[indexOffset+1];
forceSum[2] += amoebaGpu->psForce->_pSysData[indexOffset+2];
// torque
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset],
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
conversion*amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
conversion*amoebaGpu->psTorque->_pSysData[indexOffset],
conversion*amoebaGpu->psTorque->_pSysData[indexOffset+1],
conversion*amoebaGpu->psTorque->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
......@@ -1337,15 +1338,15 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
int offset3 = 8*paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs( debugArray->_pSysStream[0][debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
if( fabs( debugArray->_pSysData[debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
if( jj != targetAtom ){
torqueSum0[0] += debugArray->_pSysStream[0][debugIndex+offset0].x + debugArray->_pSysStream[0][debugIndex+offset1].x;
torqueSum0[1] += debugArray->_pSysStream[0][debugIndex+offset0].y + debugArray->_pSysStream[0][debugIndex+offset1].y;
torqueSum0[2] += debugArray->_pSysStream[0][debugIndex+offset0].z + debugArray->_pSysStream[0][debugIndex+offset1].z;
torqueSum0[0] += debugArray->_pSysData[debugIndex+offset0].x + debugArray->_pSysData[debugIndex+offset1].x;
torqueSum0[1] += debugArray->_pSysData[debugIndex+offset0].y + debugArray->_pSysData[debugIndex+offset1].y;
torqueSum0[2] += debugArray->_pSysData[debugIndex+offset0].z + debugArray->_pSysData[debugIndex+offset1].z;
torqueSum1[0] += debugArray->_pSysStream[0][debugIndex+offset2].x + debugArray->_pSysStream[0][debugIndex+offset3].x;
torqueSum1[1] += debugArray->_pSysStream[0][debugIndex+offset2].y + debugArray->_pSysStream[0][debugIndex+offset3].y;
torqueSum1[2] += debugArray->_pSysStream[0][debugIndex+offset2].z + debugArray->_pSysStream[0][debugIndex+offset3].z;
torqueSum1[0] += debugArray->_pSysData[debugIndex+offset2].x + debugArray->_pSysData[debugIndex+offset3].x;
torqueSum1[1] += debugArray->_pSysData[debugIndex+offset2].y + debugArray->_pSysData[debugIndex+offset3].y;
torqueSum1[2] += debugArray->_pSysData[debugIndex+offset2].z + debugArray->_pSysData[debugIndex+offset3].z;
}
if( jj == 2 ){
offset0 += 2*paddedNumberOfAtoms;
......@@ -1353,8 +1354,8 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
for( int kk = 0; kk < 12; kk++ ){
(void) fprintf( amoebaGpu->log,"%5d %5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj, kk,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] Sum\n", targetAtom, jj,
......@@ -1405,13 +1406,48 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
zeroForce( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
copyForce( amoebaGpu, -1.0f/41.84 );
float conversion = -1.0f/41.84;
copyForce( amoebaGpu, conversion );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDemForce", fileId, outputVector );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* debugArray = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "FArray");
int index = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
debugArray->_pSysData[index] = amoebaGpu->psForce->_pSysData[index];
debugArray->_pSysData[index+1] = amoebaGpu->psForce->_pSysData[index+1];
debugArray->_pSysData[index+2] = amoebaGpu->psForce->_pSysData[index+2];
index += 3;
}
//zeroForce( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
copyForce( amoebaGpu, conversion );
VectorOfDoubleVectors outputVector1;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector1, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector1 );
VectorOfDoubleVectors outputVector2;
index = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
amoebaGpu->psForce->_pSysData[index] -= debugArray->_pSysData[index];
amoebaGpu->psForce->_pSysData[index+1] -= debugArray->_pSysData[index+1];
amoebaGpu->psForce->_pSysData[index+2] -= debugArray->_pSysData[index+2];
index += 3;
}
amoebaGpu->psForce->Upload();
outputVector.resize(0);
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector2, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDepForce", fileId, outputVector2 );
zeroForce( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
//zeroForce( amoebaGpu );
exit(0);
}
if( 0 ){
......
......@@ -335,8 +335,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
} else {
sA[threadIdx.x].tempForce[0] = forceTorqueEnergy[0].x;
sA[threadIdx.x].tempForce[1] = forceTorqueEnergy[1].y;
sA[threadIdx.x].tempForce[2] = forceTorqueEnergy[2].z;
sA[threadIdx.x].tempForce[1] = forceTorqueEnergy[0].y;
sA[threadIdx.x].tempForce[2] = forceTorqueEnergy[0].z;
sA[threadIdx.x].tempTorque[0] = forceTorqueEnergy[2].x;
sA[threadIdx.x].tempTorque[1] = forceTorqueEnergy[2].y;
......
......@@ -144,14 +144,13 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
unsigned int indices[4] = { indexI, indexJ, indexI+1, indexJ+1 };
float flag = 7.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
......@@ -163,14 +162,31 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].z;
debugArray[index].y = match ? 0.0f : ijField[1].z;
debugArray[index].z = match ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 2.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].y;
debugArray[index].y = match ? 0.0f : ijField[1].y;
debugArray[index].z = match ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 3.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].w;
debugArray[index].y = match ? 0.0f : ijField[1].w;
debugArray[index].z = match ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int ii = 0; ii < 4; ii++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indices[ii]][0];
debugArray[index].y = match ? 0.0f : ijField[indices[ii]][1];
debugArray[index].z = match ? 0.0f : ijField[indices[ii]][2];
debugArray[index].w = flag;
}
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
......@@ -314,6 +330,7 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
debugArray[index].w = pScaleValue;
float flag = 9.0f;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
......@@ -325,30 +342,30 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI][2];
debugArray[index].w = flag;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].x;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].x;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ][2];
debugArray[index].w = flag;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].y;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].y;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 2.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI+1][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI+1][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI+1][2];
debugArray[index].w = flag;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].z;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].z;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 3.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ+1][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ+1][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ+1][2];
debugArray[index].w = flag;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].w;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].w;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
......
......@@ -122,7 +122,6 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
/*
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI;
......@@ -151,35 +150,35 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
index += cAmoebaSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
......@@ -192,7 +191,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
}
#endif
*/
}
// Write results
......@@ -274,13 +273,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBuffer[0] = mask ? ijField[0].y : 0.0;
sA[threadIdx.x].tempBuffer[1] = mask ? ijField[1].y : 0.0;
sA[threadIdx.x].tempBuffer[2] = mask ? ijField[2].y : 0.0;
sA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[2].w;
sA[threadIdx.x].tempBufferP[0] = mask ? ijField[0].w : 0.0;
sA[threadIdx.x].tempBufferP[1] = mask ? ijField[1].w : 0.0;
sA[threadIdx.x].tempBufferP[2] = mask ? ijField[2].w : 0.0;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......
......@@ -423,15 +423,15 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
for( int ii = 0; ii < gpu->natoms; ii++ ){
int mIndex = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d] x[%16.9e %16.9e %16.9e] dpl[%16.9e %16.9e %16.9e]\nRot[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n\n", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w,
gpu->psPosq4->_pSysStream[0][ii].x,
gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z,
amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex],
amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex+1],
amoebaGpu->psMolecularDipole->_pSysStream[0][mIndex+2] );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z,
amoebaGpu->psMolecularDipole->_pSysData[mIndex],
amoebaGpu->psMolecularDipole->_pSysData[mIndex+1],
amoebaGpu->psMolecularDipole->_pSysData[mIndex+2] );
}
}
// int64 kernelTime = AmoebaTiming::getTimeOfDay();
......@@ -452,10 +452,10 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "Finished rotation kernel execution in %lf us\n", kernelTime ); (void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "psLabFrameDipole=%p _pSysStream=%p _pSysStream[0]=%p _pDevStream=%p _pDevStream[0]=%p\n",
amoebaGpu->psLabFrameDipole, amoebaGpu->psLabFrameDipole->_pSysStream,
amoebaGpu->psLabFrameDipole->_pSysStream[0], amoebaGpu->psLabFrameDipole->_pDevStream, amoebaGpu->psLabFrameDipole->_pDevStream[0] );
amoebaGpu->psLabFrameDipole->_pSysData, amoebaGpu->psLabFrameDipole->_pDevStream, amoebaGpu->psLabFrameDipole->_pDevData );
fflush( amoebaGpu->log );
amoebaGpu->psRotationMatrix->Download();
//amoebaGpu->psRotationMatrix->Download();
amoebaGpu->psLabFrameDipole->Download();
(void) fprintf( amoebaGpu->log, "psLabFrameDipole completed\n" ); (void) fflush( amoebaGpu->log );
......@@ -469,46 +469,46 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
int quadrupoleOffset = 9*ii;
(void) fprintf( amoebaGpu->log,"\n%6d [%6d %6d %6d] ", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].w );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w );
// coords
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e]\n",
gpu->psPosq4->_pSysStream[0][ii].x,
gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z);
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
/*
(void) fprintf( amoebaGpu->log," R[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+1],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+2],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+3],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+4],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+5],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+6],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+7],
amoebaGpu->psRotationMatrix->_pSysStream[0][quadrupoleOffset+8] );
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+1],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+2],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+3],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+4],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+5],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+6],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+7],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+8] );
*/
// dipole
(void) fprintf( amoebaGpu->log," D[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset],
amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset+1],
amoebaGpu->psLabFrameDipole->_pSysStream[0][dipoleOffset+2] );
amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset],
amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+1],
amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+2] );
// quadrupole
(void) fprintf( amoebaGpu->log," Q[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+1],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+2],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+3],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+4],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+5],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+6],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+7],
amoebaGpu->psLabFrameQuadrupole->_pSysStream[0][quadrupoleOffset+8] );
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+1],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+2],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+3],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+4],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+5],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+6],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+7],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+8] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint)) ){
ii = gpu->natoms - maxPrint;
......@@ -549,41 +549,30 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
}
//#define GET_INDUCED_DIPOLE_FROM_FILE
#ifdef GET_INDUCED_DIPOLE_FROM_FILE
#include <stdlib.h>
#endif
void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood )
{
std::string methodName = "kCalculateAmoebaMultipoleForces";
//printf("%s \n", methodName.c_str() ); fflush( stdout );
// compute lab frame moments
cudaComputeAmoebaLabFrameMoments( amoebaGpu );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psLabFrameDipole, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 9, amoebaGpu->psLabFrameQuadrupole, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaLabMoments", fileId, outputVector );
}
// compute fixed E-field and mutual induced field
if( hasAmoebaGeneralizedKirkwood ){
cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, 0.0 );
}
cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipoleS, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolarS, 0.0 );
amoebaGpu->mutualInducedDone = 1;
}
} else {
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaFixedEField( amoebaGpu );
......@@ -595,75 +584,12 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
//compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevData, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
/*
if( 0 ){
gpu->psInteractionCount->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
//unsigned int y = ((x >> 2) & 0x7fff);
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
// x = (x >> 17);
(void) fprintf( amoebaGpu->log, "GpuCell %8u %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
x = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
}
}
*/
cudaComputeAmoebaPmeFixedEField( amoebaGpu );
cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
#ifdef GET_INDUCED_DIPOLE_FROM_FILE
if( 0 ){
//std::string fileName = "waterInducedDipole.txt";
std::string fileName = "water_3_MI.txt";
StringVectorVector fileContents;
readFile( fileName, fileContents );
unsigned int offset = 0;
(void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush( amoebaGpu->log );
for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
StringVector lineTokens = fileContents[ii];
unsigned int lineTokenIndex = 1;
(void) fprintf( amoebaGpu->log, " %u %s [%s %s %s] [%15.7e %15.7e %15.7e]\n", ii, lineTokens[0].c_str(),
lineTokens[lineTokenIndex].c_str(), lineTokens[lineTokenIndex+1].c_str(), lineTokens[lineTokenIndex+2].c_str(),
amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2]);
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
}
(void) fflush( amoebaGpu->log );
float conversion = 0.1f;
for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
amoebaGpu->psInducedDipole->_pSysStream[0][ii] *= conversion;
amoebaGpu->psInducedDipolePolar->_pSysStream[0][ii] *= conversion;
}
//amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
//SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
amoebaGpu->psInducedDipole->Upload();
amoebaGpu->psInducedDipolePolar->Upload();
}
#endif
}
}
......
......@@ -66,7 +66,7 @@ void kClearFloat4_kernel( unsigned int bufferLength, float4* fieldToClear )
void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear )
{
kClearFloat4_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevStream[0] );
kClearFloat4_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevData );
LAUNCHERROR("kClearFloat4");
}
......@@ -90,7 +90,7 @@ void kClearFloat_kernel( unsigned int bufferLength, float* fieldToClear )
void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear )
{
kClearFloat_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevStream[0] );
kClearFloat_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevData );
LAUNCHERROR("kClearFloat");
}
......@@ -118,23 +118,23 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
{
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
LAUNCHERROR("kClearFields_3_1");
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
amoebaGpu->psWorkArray_3_2->_pDevData );
LAUNCHERROR("kClearFields_3_2");
if( numberToClear > 2 ){
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevStream[0] );
amoebaGpu->psWorkArray_3_3->_pDevData );
LAUNCHERROR("kClearFields_3_3");
} else {
return;
}
if( numberToClear > 3 ){
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevStream[0] );
amoebaGpu->psWorkArray_3_4->_pDevData );
LAUNCHERROR("kClearFields_3_4");
}
}
......@@ -145,11 +145,11 @@ void kClearFields_1( amoebaGpuContext amoebaGpu )
{
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevStream[0] );
amoebaGpu->psWorkArray_1_1->_pDevData );
LAUNCHERROR("kClearFields_1_1");
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevStream[0] );
amoebaGpu->psWorkArray_1_2->_pDevData );
LAUNCHERROR("kClearFields_1_2");
}
......
......@@ -271,7 +271,7 @@ static void kCalculateAmoebaVdw14_7Reduction(amoebaGpuContext amoebaGpu, CUDAStr
else
threadsPerBlock = G8X_NONBOND_THREADS_PER_BLOCK;
kCalculateAmoebaVdw14_7Reduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, threadsPerBlock>>>(
vdwOutputArray->_pDevStream[0], forceOutputArray->_pDevStream[0] );
vdwOutputArray->_pDevData, forceOutputArray->_pDevData );
LAUNCHERROR("kCalculateAmoebaVdw14_7Reduction");
}
......@@ -369,7 +369,7 @@ static void kCalculateAmoebaVdw14_7CoordinateReduction(amoebaGpuContext amoebaGp
else
threadsPerBlock = G8X_THREADS_PER_BLOCK;
kCalculateAmoebaVdw14_7CoordinateReduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, threadsPerBlock>>>(
coordinateArray->_pDevStream[0], reducedCoordinateArray->_pDevStream[0] );
coordinateArray->_pDevData, reducedCoordinateArray->_pDevData );
LAUNCHERROR("kCalculateAmoebaVdw14_7CoordinateReduction");
}
......@@ -404,7 +404,7 @@ void kCalculateAmoebaVdw14_7NonReduction_kernel( float* inputForce, float4* outp
static void kCalculateAmoebaVdw14_7NonReduction(amoebaGpuContext amoebaGpu, CUDAStream<float>* vdwOutputArray, CUDAStream<float4>* forceOutputArray )
{
kCalculateAmoebaVdw14_7NonReduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>(
vdwOutputArray->_pDevStream[0], forceOutputArray->_pDevStream[0] );
vdwOutputArray->_pDevData, forceOutputArray->_pDevData );
LAUNCHERROR("kCalculateAmoebaVdw14_7MonReduction");
}
......@@ -439,7 +439,7 @@ static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* output
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceVdw14_7");
}
......@@ -467,7 +467,7 @@ void kCalculateAmoebaVdw14_7CopyCoordinates_kernel( unsigned int bufferLength, f
void kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpuContext amoebaGpu, CUDAStream<float4>* toCopy, CUDAStream<float4>* copy )
{
kCalculateAmoebaVdw14_7CopyCoordinates_kernel<<<amoebaGpu->gpuContext->blocksPerSM, 384>>>( amoebaGpu->gpuContext->sim.paddedNumberOfAtoms,
toCopy->_pDevStream[0], copy->_pDevStream[0] );
toCopy->_pDevData, copy->_pDevData );
LAUNCHERROR("kCalculateAmoebaVdw14_7CopyCoordinates");
}
......@@ -499,7 +499,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#ifdef AMOEBA_DEBUG
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
int targetAtom = 342;
#endif
......@@ -534,10 +534,10 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwWorkUnit->Download();
unsigned int totalWarps = (amoebaGpu->nonbondBlocks*threadsPerBlock)/GRID;
float ratiof = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
(void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysStream[0][0] );
(void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
unsigned int x = amoebaGpu->psVdwWorkUnit->_pSysStream[0][ii];
unsigned int x = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
......@@ -570,20 +570,20 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psVdwWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
(void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
unsigned int x = gpu->psInteractingWorkUnit->_pSysData[ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "GpuCell %8u %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
(void) fprintf( amoebaGpu->log, "GpuCell %8u %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysData[ii], x,y,exclusions, gpu->psInteractionFlag->_pSysData[ii] );
x = amoebaGpu->psVdwWorkUnit->_pSysStream[0][ii];
x = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
}
(void) fflush( amoebaGpu->log );
}
......@@ -591,29 +591,29 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaVdw14_7Cutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
......@@ -624,30 +624,30 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
amoebaGpu->psVdwWorkUnit->_pDevStream[0],
amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
amoebaGpu->psVdwWorkUnit->_pDevData,
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaVdw14_7N2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
amoebaGpu->psVdwWorkUnit->_pDevStream[0],
amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
amoebaGpu->psVdwWorkUnit->_pDevData,
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
......@@ -669,11 +669,11 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
(void) fprintf( amoebaGpu->log,"%5d %5d DebugVdw\n", targetAtom, jj );
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
if( kk == 4 && ( fabs( debugArray->_pSysStream[0][debugIndex].x ) > cutOff ||
fabs( debugArray->_pSysStream[0][debugIndex].y ) > cutOff ||
fabs( debugArray->_pSysStream[0][debugIndex].z ) > cutOff ) ){
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
if( kk == 4 && ( fabs( debugArray->_pSysData[debugIndex].x ) > cutOff ||
fabs( debugArray->_pSysData[debugIndex].y ) > cutOff ||
fabs( debugArray->_pSysData[debugIndex].z ) > cutOff ) ){
(void) fprintf( amoebaGpu->log," XXXX\n" );
}
debugIndex += paddedNumberOfAtoms;
......
......@@ -32,7 +32,7 @@ void GetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext amoebaGpu)
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
fprintf( stderr, "In GetCalculateAmoebaCudaWcaDispersionSim: %p %lu %u\n",
gpu->psInteractionCount->_pSysStream[0], gpu->psInteractionCount->_pSysStream[0][0], gpu->sim.workUnits );
gpu->psInteractionCount->_pSysData, gpu->psInteractionCount->_pSysData[0], gpu->sim.workUnits );
RTERROR(status, "GetCalculateAmoebaCudaWcaDispersionSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
......@@ -363,7 +363,7 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceWcaDispersion");
}
......@@ -373,7 +373,7 @@ static void kReduceWcaDispersionToFloat4(amoebaGpuContext amoebaGpu, CUDAStream<
{
kReduceFieldsToFloat4_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceWcaDispersion");
}
......@@ -400,7 +400,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
static const char* methodName = "kCalculateAmoebaWcaDispersionForces";
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
int targetAtom = 3;
#endif
......@@ -420,14 +420,14 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
......@@ -442,14 +442,14 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
#endif
kCalculateAmoebaWcaDispersionN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
......@@ -481,80 +481,80 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
for( int kk = 0; kk < -3; kk++ ){
if( kk == 1 ){
block = static_cast<int>(debugArray->_pSysStream[0][debugIndex].w + 1.0e-04);
block = static_cast<int>(debugArray->_pSysData[debugIndex].w + 1.0e-04);
if( buffers.find(block) == buffers.end() ){
buffers[block] = 0.0;
}
}
if( kk == 1 && jj != targetAtom ){
sums[0] += debugArray->_pSysStream[0][debugIndex].y;
sums[1] += debugArray->_pSysStream[0][debugIndex].z;
sums[2] += debugArray->_pSysStream[0][debugIndex].w;
double x4 = debugArray->_pSysStream[0][debugIndex].x - (debugArray->_pSysStream[0][debugIndex].y + debugArray->_pSysStream[0][debugIndex].z + debugArray->_pSysStream[0][debugIndex].w);
sums[0] += debugArray->_pSysData[debugIndex].y;
sums[1] += debugArray->_pSysData[debugIndex].z;
sums[2] += debugArray->_pSysData[debugIndex].w;
double x4 = debugArray->_pSysData[debugIndex].x - (debugArray->_pSysData[debugIndex].y + debugArray->_pSysData[debugIndex].z + debugArray->_pSysData[debugIndex].w);
sums[3] += x4;
//sum += debugArray->_pSysStream[0][debugIndex].x;
sum += debugArray->_pSysStream[0][debugIndex].z;
buffers[block] += debugArray->_pSysStream[0][debugIndex].z;
//sum += debugArray->_pSysData[debugIndex].x;
sum += debugArray->_pSysData[debugIndex].z;
buffers[block] += debugArray->_pSysData[debugIndex].z;
(void) fprintf( amoebaGpu->log," %16.9e [%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, x4);
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, x4);
} else if( kk == 2 && jj != targetAtom){
//sum += debugArray->_pSysStream[0][debugIndex].x;
sum += debugArray->_pSysStream[0][debugIndex].z;
sums[4] += debugArray->_pSysStream[0][debugIndex].y;
sums[5] += debugArray->_pSysStream[0][debugIndex].z;
sums[6] += debugArray->_pSysStream[0][debugIndex].w;
double x4 = debugArray->_pSysStream[0][debugIndex].x - (debugArray->_pSysStream[0][debugIndex].y + debugArray->_pSysStream[0][debugIndex].z + debugArray->_pSysStream[0][debugIndex].w);
//sum += debugArray->_pSysData[debugIndex].x;
sum += debugArray->_pSysData[debugIndex].z;
sums[4] += debugArray->_pSysData[debugIndex].y;
sums[5] += debugArray->_pSysData[debugIndex].z;
sums[6] += debugArray->_pSysData[debugIndex].w;
double x4 = debugArray->_pSysData[debugIndex].x - (debugArray->_pSysData[debugIndex].y + debugArray->_pSysData[debugIndex].z + debugArray->_pSysData[debugIndex].w);
sums[7] += x4;
buffers[block] += debugArray->_pSysStream[0][debugIndex].z;
buffers[block] += debugArray->_pSysData[debugIndex].z;
(void) fprintf( amoebaGpu->log," %16.9e [%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, x4);
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, x4);
} else {
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e] %7u\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, debugIndex );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, debugIndex );
}
if( kk == 5 )(void) fprintf( amoebaGpu->log,"\n" );
debugIndex += paddedNumberOfAtoms;
}
block = static_cast<int>(debugArray->_pSysStream[0][debugIndex+paddedNumberOfAtoms].w + 1.0e-04);
block = static_cast<int>(debugArray->_pSysData[debugIndex+paddedNumberOfAtoms].w + 1.0e-04);
if( buffers.find(block) == buffers.end() ){
buffers[block] = 0.0;
maxD[block] = 0.0;
}
for( int kk = 0; kk < 3; kk++ ){
if( kk == 0 && jj != targetAtom ){
sum += debugArray->_pSysStream[0][debugIndex].z;
buffers[block] += debugArray->_pSysStream[0][debugIndex].z;
sum += debugArray->_pSysData[debugIndex].z;
buffers[block] += debugArray->_pSysData[debugIndex].z;
trackI[block].push_back( jj );
trackD[block].push_back( debugArray->_pSysStream[0][debugIndex].z );
trackT[block].push_back( debugArray->_pSysStream[0][debugIndex].w );
if( fabs( debugArray->_pSysStream[0][debugIndex].w ) > maxD[block] ){
maxD[block] = fabs( debugArray->_pSysStream[0][debugIndex].w );
trackD[block].push_back( debugArray->_pSysData[debugIndex].z );
trackT[block].push_back( debugArray->_pSysData[debugIndex].w );
if( fabs( debugArray->_pSysData[debugIndex].w ) > maxD[block] ){
maxD[block] = fabs( debugArray->_pSysData[debugIndex].w );
}
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w);
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w);
} else if( kk == 2 && jj != targetAtom){
sum += debugArray->_pSysStream[0][debugIndex].z;
buffers[block] += debugArray->_pSysStream[0][debugIndex].z;
sum += debugArray->_pSysData[debugIndex].z;
buffers[block] += debugArray->_pSysData[debugIndex].z;
trackI[block].push_back( jj );
trackD[block].push_back( debugArray->_pSysStream[0][debugIndex].z );
trackT[block].push_back( debugArray->_pSysStream[0][debugIndex].w );
if( fabs( debugArray->_pSysStream[0][debugIndex].w ) > maxD[block] ){
maxD[block] = fabs( debugArray->_pSysStream[0][debugIndex].w );
trackD[block].push_back( debugArray->_pSysData[debugIndex].z );
trackT[block].push_back( debugArray->_pSysData[debugIndex].w );
if( fabs( debugArray->_pSysData[debugIndex].w ) > maxD[block] ){
maxD[block] = fabs( debugArray->_pSysData[debugIndex].w );
}
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w);
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w);
} else {
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e] %7u\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w, debugIndex );
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, debugIndex );
}
if( kk == 5 )(void) fprintf( amoebaGpu->log,"\n" );
debugIndex += paddedNumberOfAtoms;
......@@ -600,11 +600,11 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
int debugIndex9 = debugIndex + 9*paddedNumberOfAtoms;
int debugIndex10 = debugIndex + 10*paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"%6d %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e \n", jj,
debugArray->_pSysStream[0][debugIndex4].x*debugArray->_pSysStream[0][debugIndex4].x, // r2
debugArray->_pSysStream[0][debugIndex4].y, debugArray->_pSysStream[0][debugIndex9].y, // de
debugArray->_pSysStream[0][debugIndex5].x, debugArray->_pSysStream[0][debugIndex10].x, // dll
debugArray->_pSysStream[0][debugIndex5].y, debugArray->_pSysStream[0][debugIndex10].y, // duu
debugArray->_pSysStream[0][debugIndex4].z, debugArray->_pSysStream[0][debugIndex9].z ); // emxio
debugArray->_pSysData[debugIndex4].x*debugArray->_pSysData[debugIndex4].x, // r2
debugArray->_pSysData[debugIndex4].y, debugArray->_pSysData[debugIndex9].y, // de
debugArray->_pSysData[debugIndex5].x, debugArray->_pSysData[debugIndex10].x, // dll
debugArray->_pSysData[debugIndex5].y, debugArray->_pSysData[debugIndex10].y, // duu
debugArray->_pSysData[debugIndex4].z, debugArray->_pSysData[debugIndex9].z ); // emxio
}
*/
}
......@@ -616,7 +616,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
gpu->psEnergy->Download();
double sum = 0.0;
for (int i = 0; i < gpu->sim.energyOutputBuffers; i++){
sum += gpu->psEnergy->_pSysStream[0][i];
sum += gpu->psEnergy->_pSysData[i];
if( fabsf( (*gpu->psEnergy)[i]) > 0.0 )
(void) fprintf( amoebaGpu->log,"SumQQ %6d %14.7e QQ SUM\n", i, (*gpu->psEnergy)[i] );
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment