Commit 101f206d authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Added loop over particles for torque mapping

parent 41abd9fb
...@@ -75,15 +75,13 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) ...@@ -75,15 +75,13 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif #endif
void amoebaMapTorqueToForce_kernel( float* torque ){ void amoebaMapTorqueToForce_kernel( float* torque )
{
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
int ii; int ii;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int particleIndex = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
int numOfAtoms = cSim.atoms;
if( threadId >= numOfAtoms )return;
float4* atomCoord = cSim.pPosq; float4* atomCoord = cSim.pPosq;
int4* multiPoleAtoms = cAmoebaSim.pMultipoleParticlesIdsAndAxisType; int4* multiPoleAtoms = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
...@@ -113,255 +111,255 @@ void amoebaMapTorqueToForce_kernel( float* torque ){ ...@@ -113,255 +111,255 @@ void amoebaMapTorqueToForce_kernel( float* torque ){
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
int axisAtom = multiPoleAtoms[threadId].z; while( particleIndex < cSim.atoms )
int axisType = multiPoleAtoms[threadId].w; {
// NoAxisType
if( axisType == 5 )
{
return;
}
vector[U][0] = atomCoord[threadId].x - atomCoord[axisAtom].x;
vector[U][1] = atomCoord[threadId].y - atomCoord[axisAtom].y;
vector[U][2] = atomCoord[threadId].z - atomCoord[axisAtom].z;
norms[U] = normVector3( vector[U] );
if( axisType != 4 ){
axisAtom = multiPoleAtoms[threadId].x;
vector[V][0] = atomCoord[threadId].x - atomCoord[axisAtom].x;
vector[V][1] = atomCoord[threadId].y - atomCoord[axisAtom].y;
vector[V][2] = atomCoord[threadId].z - atomCoord[axisAtom].z;
} else {
vector[V][0] = 0.1f;
vector[V][1] = 0.1f;
vector[V][2] = 0.1f;
}
norms[V] = normVector3( vector[V] );
// W = UxV
if( axisType < 2 || axisType > 3 ){
crossVector3( vector[U], vector[V], vector[W] );
} else {
axisAtom = multiPoleAtoms[threadId].y;
vector[W][0] = atomCoord[threadId].x - atomCoord[axisAtom].x;
vector[W][1] = atomCoord[threadId].y - atomCoord[axisAtom].y;
vector[W][2] = atomCoord[threadId].z - atomCoord[axisAtom].z;
}
norms[W] = normVector3( vector[W] );
crossVector3( vector[V], vector[U], vector[UV] );
crossVector3( vector[W], vector[U], vector[UW] );
crossVector3( vector[W], vector[V], vector[VW] );
norms[UV] = normVector3( vector[UV] ); int axisAtom = multiPoleAtoms[particleIndex].z;
norms[UW] = normVector3( vector[UW] ); int axisType = multiPoleAtoms[particleIndex].w;
norms[VW] = normVector3( vector[VW] );
angles[UV][0] = DOT3( vector[U], vector[V] );
angles[UV][1] = sqrtf( 1.0f - angles[UV][0]*angles[UV][0]);
angles[UW][0] = DOT3( vector[U], vector[W] );
angles[UW][1] = sqrtf( 1.0f - angles[UW][0]*angles[UW][0]);
angles[VW][0] = DOT3( vector[V], vector[W] );
angles[VW][1] = sqrtf( 1.0f - angles[VW][0]*angles[VW][0]);
float dphi[3];
dphi[U] = DOT3( vector[U], (torque + threadId*3) );
dphi[V] = DOT3( vector[V], (torque + threadId*3) );
dphi[W] = DOT3( vector[W], (torque + threadId*3) );
dphi[U] *= -1.0f;
dphi[V] *= -1.0f;
dphi[W] *= -1.0f;
// z-then-x and bisector
if( axisType == 0 || axisType == 1 ){
float factor1;
float factor2;
float factor3;
float factor4;
factor1 = dphi[V]/(norms[U]*angles[UV][1]); // NoAxisType
factor2 = dphi[W]/(norms[U]);
factor3 = -dphi[U]/(norms[V]*angles[UV][1]);
if( axisType == 1 ){ if( axisType < 5 && multiPoleAtoms[particleIndex].z >= 0 )
factor2 *= 0.5f; {
factor4 = 0.5f*dphi[W]/(norms[V]);
} else { vector[U][0] = atomCoord[particleIndex].x - atomCoord[axisAtom].x;
factor4 = 0.0f; vector[U][1] = atomCoord[particleIndex].y - atomCoord[axisAtom].y;
} vector[U][2] = atomCoord[particleIndex].z - atomCoord[axisAtom].z;
for( ii = 0; ii < 3; ii++ ){ norms[U] = normVector3( vector[U] );
forces[Z][ii] = vector[UV][ii]*factor1 + factor2*vector[UW][ii];
forces[X][ii] = vector[UV][ii]*factor3 + factor4*vector[VW][ii]; if( axisType != 4 && multiPoleAtoms[particleIndex].x >= 0 ){
forces[I][ii] = -(forces[X][ii] + forces[Z][ii]);
forces[Y][ii] = 0.0f; axisAtom = multiPoleAtoms[particleIndex].x;
} vector[V][0] = atomCoord[particleIndex].x - atomCoord[axisAtom].x;
vector[V][1] = atomCoord[particleIndex].y - atomCoord[axisAtom].y;
} else if( axisType == 2 ){ vector[V][2] = atomCoord[particleIndex].z - atomCoord[axisAtom].z;
// z-bisect } else {
vector[V][0] = 0.1f;
vector[R][0] = vector[V][0] + vector[W][0]; vector[V][1] = 0.1f;
vector[R][1] = vector[V][1] + vector[W][1]; vector[V][2] = 0.1f;
vector[R][2] = vector[V][2] + vector[W][2]; }
crossVector3( vector[U], vector[R], vector[S] ); norms[V] = normVector3( vector[V] );
norms[R] = normVector3( vector[R] ); // W = UxV
norms[S] = normVector3( vector[S] );
if( axisType < 2 || axisType > 3 ){
crossVector3( vector[R], vector[U], vector[UR] ); crossVector3( vector[U], vector[V], vector[W] );
crossVector3( vector[S], vector[U], vector[US] ); } else {
crossVector3( vector[S], vector[V], vector[VS] ); axisAtom = multiPoleAtoms[particleIndex].y;
crossVector3( vector[S], vector[W], vector[WS] );
vector[W][0] = atomCoord[particleIndex].x - atomCoord[axisAtom].x;
norms[UR] = normVector3( vector[UR] ); vector[W][1] = atomCoord[particleIndex].y - atomCoord[axisAtom].y;
norms[US] = normVector3( vector[US] ); vector[W][2] = atomCoord[particleIndex].z - atomCoord[axisAtom].z;
norms[VS] = normVector3( vector[VS] ); }
norms[WS] = normVector3( vector[WS] ); norms[W] = normVector3( vector[W] );
angles[UR][0] = DOT3( vector[U], vector[R] ); crossVector3( vector[V], vector[U], vector[UV] );
angles[UR][1] = sqrtf( 1.0f - angles[UR][0]*angles[UR][0]); crossVector3( vector[W], vector[U], vector[UW] );
crossVector3( vector[W], vector[V], vector[VW] );
angles[US][0] = DOT3( vector[U], vector[S] );
angles[US][1] = sqrtf( 1.0f - angles[US][0]*angles[US][0]); norms[UV] = normVector3( vector[UV] );
norms[UW] = normVector3( vector[UW] );
angles[VS][0] = DOT3( vector[V], vector[S] ); norms[VW] = normVector3( vector[VW] );
angles[VS][1] = sqrtf( 1.0f - angles[VS][0]*angles[VS][0]);
angles[UV][0] = DOT3( vector[U], vector[V] );
angles[WS][0] = DOT3( vector[W], vector[S] ); angles[UV][1] = sqrtf( 1.0f - angles[UV][0]*angles[UV][0]);
angles[WS][1] = sqrtf( 1.0f - angles[WS][0]*angles[WS][0]);
angles[UW][0] = DOT3( vector[U], vector[W] );
float t1[3]; angles[UW][1] = sqrtf( 1.0f - angles[UW][0]*angles[UW][0]);
float t2[3];
t1[0] = vector[V][0] - vector[S][0]*angles[VS][0]; angles[VW][0] = DOT3( vector[V], vector[W] );
t1[1] = vector[V][1] - vector[S][1]*angles[VS][0]; angles[VW][1] = sqrtf( 1.0f - angles[VW][0]*angles[VW][0]);
t1[2] = vector[V][2] - vector[S][2]*angles[VS][0];
float dphi[3];
t2[0] = vector[W][0] - vector[S][0]*angles[WS][0]; dphi[U] = DOT3( vector[U], (torque + particleIndex*3) );
t2[1] = vector[W][1] - vector[S][1]*angles[WS][0]; dphi[V] = DOT3( vector[V], (torque + particleIndex*3) );
t2[2] = vector[W][2] - vector[S][2]*angles[WS][0]; dphi[W] = DOT3( vector[W], (torque + particleIndex*3) );
float notUsed = normVector3( t1 );
notUsed = normVector3( t2 ); dphi[U] *= -1.0f;
float ut1cos = DOT3( vector[U], t1 ); dphi[V] *= -1.0f;
float ut1sin = sqrtf( 1.0f - ut1cos*ut1cos); dphi[W] *= -1.0f;
float ut2cos = DOT3( vector[U], t2 );
float ut2sin = sqrtf( 1.0f - ut2cos*ut2cos); // z-then-x and bisector
float dphiR = -1.0f*DOT3( vector[R], (torque + threadId*3) ); if( axisType == 0 || axisType == 1 ){
float dphiS = -1.0f*DOT3( vector[S], (torque + threadId*3) );
float factor1;
float factor1 = dphiR/(norms[U]*angles[UR][1]); float factor2;
float factor2 = dphiS/(norms[U]); float factor3;
float factor3 = dphi[U]/(norms[V]*(ut1sin+ut2sin)); float factor4;
float factor4 = dphi[U]/(norms[W]*(ut1sin+ut2sin));
for( ii = 0; ii < 3; ii++ ){ factor1 = dphi[V]/(norms[U]*angles[UV][1]);
forces[Z][ii] = vector[UR][ii]*factor1 + factor2*vector[US][ii]; factor2 = dphi[W]/(norms[U]);
forces[X][ii] = (angles[VS][1]*vector[S][ii] - angles[VS][0]*t1[ii])*factor3; factor3 = -dphi[U]/(norms[V]*angles[UV][1]);
forces[Y][ii] = (angles[WS][1]*vector[S][ii] - angles[WS][0]*t2[ii])*factor4;
forces[I][ii] = -(forces[X][ii] + forces[Y][ii] + forces[Z][ii]); if( axisType == 1 ){
} factor2 *= 0.5f;
factor4 = 0.5f*dphi[W]/(norms[V]);
} else if( axisType == 3 ){ } else {
factor4 = 0.0f;
// 3-fold }
for( ii = 0; ii < 3; ii++ ){ for( ii = 0; ii < 3; ii++ ){
float du = vector[UW][ii]*dphi[W]/(norms[U]*angles[UW][1]) + forces[Z][ii] = vector[UV][ii]*factor1 + factor2*vector[UW][ii];
vector[UV][ii]*dphi[V]/(norms[U]*angles[UV][1]) - forces[X][ii] = vector[UV][ii]*factor3 + factor4*vector[VW][ii];
vector[UW][ii]*dphi[U]/(norms[U]*angles[UW][1]) - forces[I][ii] = -(forces[X][ii] + forces[Z][ii]);
vector[UV][ii]*dphi[U]/(norms[U]*angles[UV][1]); forces[Y][ii] = 0.0f;
}
float dv = vector[VW][ii]*dphi[W]/(norms[V]*angles[VW][1]) -
vector[UV][ii]*dphi[U]/(norms[V]*angles[UV][1]) - } else if( axisType == 2 ){
vector[VW][ii]*dphi[V]/(norms[V]*angles[VW][1]) +
vector[UV][ii]*dphi[V]/(norms[V]*angles[UV][1]); // z-bisect
float dw = -vector[UW][ii]*dphi[U]/(norms[W]*angles[UW][1]) - vector[R][0] = vector[V][0] + vector[W][0];
vector[VW][ii]*dphi[V]/(norms[W]*angles[VW][1]) + vector[R][1] = vector[V][1] + vector[W][1];
vector[UW][ii]*dphi[W]/(norms[W]*angles[UW][1]) + vector[R][2] = vector[V][2] + vector[W][2];
vector[VW][ii]*dphi[W]/(norms[W]*angles[VW][1]);
crossVector3( vector[U], vector[R], vector[S] );
du /= 3.0f;
dv /= 3.0f; norms[R] = normVector3( vector[R] );
dw /= 3.0f; norms[S] = normVector3( vector[S] );
forces[Z][ii] = du; crossVector3( vector[R], vector[U], vector[UR] );
forces[X][ii] = dv; crossVector3( vector[S], vector[U], vector[US] );
forces[Y][ii] = dw; crossVector3( vector[S], vector[V], vector[VS] );
forces[I][ii] = -(du + dv + dw); crossVector3( vector[S], vector[W], vector[WS] );
}
norms[UR] = normVector3( vector[UR] );
} else if( axisType == 4 ){ norms[US] = normVector3( vector[US] );
norms[VS] = normVector3( vector[VS] );
// z-only norms[WS] = normVector3( vector[WS] );
for( ii = 0; ii < 3; ii++ ){ angles[UR][0] = DOT3( vector[U], vector[R] );
float du = vector[UV][ii]*dphi[V]/(norms[U]*angles[UV][1]); angles[UR][1] = sqrtf( 1.0f - angles[UR][0]*angles[UR][0]);
forces[Z][ii] = du;
forces[X][ii] = 0.0f; angles[US][0] = DOT3( vector[U], vector[S] );
forces[Y][ii] = 0.0f; angles[US][1] = sqrtf( 1.0f - angles[US][0]*angles[US][0]);
forces[I][ii] = -du;
} angles[VS][0] = DOT3( vector[V], vector[S] );
} else { angles[VS][1] = sqrtf( 1.0f - angles[VS][0]*angles[VS][0]);
for( ii = 0; ii < 3; ii++ ){ angles[WS][0] = DOT3( vector[W], vector[S] );
forces[Z][ii] = 0.0f; angles[WS][1] = sqrtf( 1.0f - angles[WS][0]*angles[WS][0]);
forces[X][ii] = 0.0f;
forces[Y][ii] = 0.0f; float t1[3];
forces[I][ii] = 0.0f; float t2[3];
} t1[0] = vector[V][0] - vector[S][0]*angles[VS][0];
} t1[1] = vector[V][1] - vector[S][1]*angles[VS][0];
t1[2] = vector[V][2] - vector[S][2]*angles[VS][0];
// load results
t2[0] = vector[W][0] - vector[S][0]*angles[WS][0];
// Z t2[1] = vector[W][1] - vector[S][1]*angles[WS][0];
t2[2] = vector[W][2] - vector[S][2]*angles[WS][0];
int4 forceBufferIndices = cAmoebaSim.pMultipoleParticlesTorqueBufferIndices[threadId]; float notUsed = normVector3( t1 );
loadMappedTorque( multiPoleAtoms[threadId].z, forceBufferIndices.z, forces[Z] ); notUsed = normVector3( t2 );
float ut1cos = DOT3( vector[U], t1 );
// X float ut1sin = sqrtf( 1.0f - ut1cos*ut1cos);
float ut2cos = DOT3( vector[U], t2 );
if( axisType != 4 ){ float ut2sin = sqrtf( 1.0f - ut2cos*ut2cos);
loadMappedTorque( multiPoleAtoms[threadId].x, forceBufferIndices.x, forces[X] );
} float dphiR = -1.0f*DOT3( vector[R], (torque + particleIndex*3) );
float dphiS = -1.0f*DOT3( vector[S], (torque + particleIndex*3) );
// Y
float factor1 = dphiR/(norms[U]*angles[UR][1]);
if( axisType == 2 || axisType == 3 ){ float factor2 = dphiS/(norms[U]);
int particleId = multiPoleAtoms[threadId].y; float factor3 = dphi[U]/(norms[V]*(ut1sin+ut2sin));
if( particleId > -1 ){ float factor4 = dphi[U]/(norms[W]*(ut1sin+ut2sin));
loadMappedTorque( multiPoleAtoms[threadId].y, forceBufferIndices.y, forces[Y] ); for( ii = 0; ii < 3; ii++ ){
forces[Z][ii] = vector[UR][ii]*factor1 + factor2*vector[US][ii];
forces[X][ii] = (angles[VS][1]*vector[S][ii] - angles[VS][0]*t1[ii])*factor3;
forces[Y][ii] = (angles[WS][1]*vector[S][ii] - angles[WS][0]*t2[ii])*factor4;
forces[I][ii] = -(forces[X][ii] + forces[Y][ii] + forces[Z][ii]);
}
} else if( axisType == 3 ){
// 3-fold
for( ii = 0; ii < 3; ii++ ){
float du = vector[UW][ii]*dphi[W]/(norms[U]*angles[UW][1]) +
vector[UV][ii]*dphi[V]/(norms[U]*angles[UV][1]) -
vector[UW][ii]*dphi[U]/(norms[U]*angles[UW][1]) -
vector[UV][ii]*dphi[U]/(norms[U]*angles[UV][1]);
float dv = vector[VW][ii]*dphi[W]/(norms[V]*angles[VW][1]) -
vector[UV][ii]*dphi[U]/(norms[V]*angles[UV][1]) -
vector[VW][ii]*dphi[V]/(norms[V]*angles[VW][1]) +
vector[UV][ii]*dphi[V]/(norms[V]*angles[UV][1]);
float dw = -vector[UW][ii]*dphi[U]/(norms[W]*angles[UW][1]) -
vector[VW][ii]*dphi[V]/(norms[W]*angles[VW][1]) +
vector[UW][ii]*dphi[W]/(norms[W]*angles[UW][1]) +
vector[VW][ii]*dphi[W]/(norms[W]*angles[VW][1]);
du /= 3.0f;
dv /= 3.0f;
dw /= 3.0f;
forces[Z][ii] = du;
forces[X][ii] = dv;
forces[Y][ii] = dw;
forces[I][ii] = -(du + dv + dw);
}
} else if( axisType == 4 ){
// z-only
for( ii = 0; ii < 3; ii++ ){
float du = vector[UV][ii]*dphi[V]/(norms[U]*angles[UV][1]);
forces[Z][ii] = du;
forces[X][ii] = 0.0f;
forces[Y][ii] = 0.0f;
forces[I][ii] = -du;
}
} else {
for( ii = 0; ii < 3; ii++ ){
forces[Z][ii] = 0.0f;
forces[X][ii] = 0.0f;
forces[Y][ii] = 0.0f;
forces[I][ii] = 0.0f;
}
}
// load results
// Z
int4 forceBufferIndices = cAmoebaSim.pMultipoleParticlesTorqueBufferIndices[particleIndex];
loadMappedTorque( multiPoleAtoms[particleIndex].z, forceBufferIndices.z, forces[Z] );
// X
if( axisType != 4 ){
loadMappedTorque( multiPoleAtoms[particleIndex].x, forceBufferIndices.x, forces[X] );
}
// Y
if( axisType == 2 || axisType == 3 ){
int particleId = multiPoleAtoms[particleIndex].y;
if( particleId > -1 ){
loadMappedTorque( multiPoleAtoms[particleIndex].y, forceBufferIndices.y, forces[Y] );
}
}
// put particle force in buffer 0
loadMappedTorque( particleIndex, 0, forces[I] );
} }
particleIndex += gridDim.x*blockDim.x;
} }
// put particle force in buffer 0
loadMappedTorque( threadId, 0, forces[I] );
} }
void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDAStream<float>* psTorque ) void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDAStream<float>* psTorque )
{ {
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numThreads = min(256, (gpu->natoms)); amoebaMapTorqueToForce_kernel<<< gpu->sim.blocks, gpu->sim.update_threads_per_block>>> ( psTorque->_pDevData );
int numBlocks = 1 + (gpu->natoms/numThreads);
amoebaMapTorqueToForce_kernel<<< numBlocks, numThreads>>> ( psTorque->_pDevData );
LAUNCHERROR("amoebaMapTorqueToForce"); LAUNCHERROR("amoebaMapTorqueToForce");
} }
...@@ -1089,8 +1089,8 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu ) ...@@ -1089,8 +1089,8 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
{ {
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque"); LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
} }
......
...@@ -147,7 +147,6 @@ void kCudaComputeLabFrameMoments_kernel( void ) ...@@ -147,7 +147,6 @@ void kCudaComputeLabFrameMoments_kernel( void )
float vectorZ[3]; float vectorZ[3];
int particleIndex = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int particleIndex = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
int numberOfParticles = cSim.atoms;
float4* particleCoord = cSim.pPosq; float4* particleCoord = cSim.pPosq;
int4* multiPoleParticles = cAmoebaSim.pMultipoleParticlesIdsAndAxisType; int4* multiPoleParticles = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
...@@ -162,7 +161,7 @@ void kCudaComputeLabFrameMoments_kernel( void ) ...@@ -162,7 +161,7 @@ void kCudaComputeLabFrameMoments_kernel( void )
// code common to ZThenX and Bisector // code common to ZThenX and Bisector
while( particleIndex < numberOfParticles ) while( particleIndex < cSim.atoms )
{ {
if( multiPoleParticles[particleIndex].x >= 0 && multiPoleParticles[particleIndex].z >= 0 ) if( multiPoleParticles[particleIndex].x >= 0 && multiPoleParticles[particleIndex].z >= 0 )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment