Commit 6bad9d44 authored by Mark Friedrichs
Browse files

Removal of several arrays no longer needed

parent 1beac75d
...@@ -97,87 +97,94 @@ void gpuPrintCudaStream( std::string name, ...@@ -97,87 +97,94 @@ void gpuPrintCudaStream( std::string name,
} }
extern "C" extern "C"
void gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log ) int gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*sizeof( float );
} }
extern "C" extern "C"
void gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log ) int gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( float );
} }
extern "C" extern "C"
void gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log ) int gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( float );
} }
extern "C" extern "C"
void gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE* log ) int gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int );
} }
extern "C" extern "C"
void gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log ) int gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*sizeof( int );
} }
extern "C" extern "C"
void gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log ) int gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( int );
} }
extern "C" extern "C"
void gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log ) int gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log )
{ {
if( cUDAStream == NULL )return; if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(), gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ), cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream), static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log ); static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( int );
} }
extern "C" extern "C"
...@@ -186,6 +193,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -186,6 +193,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
if( log == NULL )return; if( log == NULL )return;
_gpuContext* gpu = amoebaGpu->gpuContext; _gpuContext* gpu = amoebaGpu->gpuContext;
int totalMemory = 0;
(void) fprintf( log, "cudaAmoebaGmxSimulation:\n\n" ); (void) fprintf( log, "cudaAmoebaGmxSimulation:\n\n" );
(void) fprintf( log, "\n" ); (void) fprintf( log, "\n" );
...@@ -206,32 +215,32 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -206,32 +215,32 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " outputBuffers %u\n", gpu->sim.outputBuffers ); (void) fprintf( log, " outputBuffers %u\n", gpu->sim.outputBuffers );
(void) fprintf( log, " workUnits %u\n", amoebaGpu->workUnits ); (void) fprintf( log, " workUnits %u\n", amoebaGpu->workUnits );
gpuPrintCudaStreamFloat( amoebaGpu->gpuContext->psEnergy, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->gpuContext->psEnergy, log );
gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psForce4, log ); totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psForce4, log );
gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psPosq4, log ); totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psPosq4, log );
gpuPrintCudaStreamFloat2( amoebaGpu->gpuContext->psObcData, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->gpuContext->psObcData, log );
gpuPrintCudaStreamFloat( amoebaGpu->gpuContext->psBornForce, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->gpuContext->psBornForce, log );
(void) fprintf( log, "\n\n" ); (void) fprintf( log, "\n\n" );
(void) fprintf( log, " amoebaBonds %u\n", amoebaGpu->amoebaSim.amoebaBonds ); (void) fprintf( log, " amoebaBonds %u\n", amoebaGpu->amoebaSim.amoebaBonds );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_1, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_1, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_2, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_2, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_3, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_3, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_4, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_4, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_1_1, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_1_1, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_1_2, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_1_2, log );
(void) fprintf( log, "\n\n" ); (void) fprintf( log, "\n\n" );
gpuPrintCudaStreamUnsignedInt( amoebaGpu->psWorkUnit, log ); totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psWorkUnit, log );
gpuPrintCudaStreamInt( amoebaGpu->psScalingIndicesIndex, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psScalingIndicesIndex, log );
gpuPrintCudaStreamInt( amoebaGpu->ps_D_ScaleIndices, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->ps_D_ScaleIndices, log );
gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log ); totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log );
gpuPrintCudaStreamInt2( amoebaGpu->ps_M_ScaleIndices, log ); totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_M_ScaleIndices, log );
if( amoebaGpu->psAmoebaBondParameter)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaBondParameter)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaBondID, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaBondID, log );
gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaBondParameter, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaBondParameter, log );
(void) fprintf( log, " amoebaBonds %u\n", amoebaGpu->amoebaSim.amoebaBonds ); (void) fprintf( log, " amoebaBonds %u\n", amoebaGpu->amoebaSim.amoebaBonds );
(void) fprintf( log, " amoebaBond_offset %u\n", amoebaGpu->amoebaSim.amoebaBond_offset ); (void) fprintf( log, " amoebaBond_offset %u\n", amoebaGpu->amoebaSim.amoebaBond_offset );
(void) fprintf( log, " cubic %15.7e\n", amoebaGpu->amoebaSim.amoebaBondCubicParameter); (void) fprintf( log, " cubic %15.7e\n", amoebaGpu->amoebaSim.amoebaBondCubicParameter);
...@@ -239,9 +248,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -239,9 +248,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaBondID %p\n", amoebaGpu->amoebaSim.pAmoebaBondID ); (void) fprintf( log, " pAmoebaBondID %p\n", amoebaGpu->amoebaSim.pAmoebaBondID );
(void) fprintf( log, " pAmoebaBondParameter %p\n", amoebaGpu->amoebaSim.pAmoebaBondParameter ); (void) fprintf( log, " pAmoebaBondParameter %p\n", amoebaGpu->amoebaSim.pAmoebaBondParameter );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaAngleID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaAngleID1, log );
gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaAngleID2, log ); totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaAngleID2, log );
gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaAngleParameter, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaAngleParameter, log );
(void) fprintf( log, "\n" ); (void) fprintf( log, "\n" );
(void) fprintf( log, " amoebaAngles %u\n", amoebaGpu->amoebaSim.amoebaAngles ); (void) fprintf( log, " amoebaAngles %u\n", amoebaGpu->amoebaSim.amoebaAngles );
(void) fprintf( log, " amoebaAngle_offset %u\n", amoebaGpu->amoebaSim.amoebaAngle_offset ); (void) fprintf( log, " amoebaAngle_offset %u\n", amoebaGpu->amoebaSim.amoebaAngle_offset );
...@@ -254,9 +263,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -254,9 +263,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaAngleParameter %p\n", amoebaGpu->amoebaSim.pAmoebaAngleParameter ); (void) fprintf( log, " pAmoebaAngleParameter %p\n", amoebaGpu->amoebaSim.pAmoebaAngleParameter );
if( amoebaGpu->psAmoebaInPlaneAngleID1 )(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaInPlaneAngleID1 )(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID1, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID2, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaInPlaneAngleID2, log );
gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaInPlaneAngleParameter, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaInPlaneAngleParameter, log );
(void) fprintf( log, "\n" ); (void) fprintf( log, "\n" );
(void) fprintf( log, " amoebaInPlaneAngles %u\n", amoebaGpu->amoebaSim.amoebaInPlaneAngles ); (void) fprintf( log, " amoebaInPlaneAngles %u\n", amoebaGpu->amoebaSim.amoebaInPlaneAngles );
(void) fprintf( log, " amoebaInPlaneAngle_offset %u\n", amoebaGpu->amoebaSim.amoebaInPlaneAngle_offset ); (void) fprintf( log, " amoebaInPlaneAngle_offset %u\n", amoebaGpu->amoebaSim.amoebaInPlaneAngle_offset );
...@@ -270,10 +279,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -270,10 +279,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
if( amoebaGpu->psAmoebaTorsionID1)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaTorsionID1)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID1, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID2, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionID2, log );
gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionParameter1, log ); totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionParameter1, log );
gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaTorsionParameter2, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaTorsionParameter2, log );
(void) fprintf( log, " amoebaTorsions %u\n", amoebaGpu->amoebaSim.amoebaTorsions ); (void) fprintf( log, " amoebaTorsions %u\n", amoebaGpu->amoebaSim.amoebaTorsions );
(void) fprintf( log, " amoebaTorsion_offset %u\n", amoebaGpu->amoebaSim.amoebaTorsion_offset ); (void) fprintf( log, " amoebaTorsion_offset %u\n", amoebaGpu->amoebaSim.amoebaTorsion_offset );
(void) fprintf( log, " pAmoebaTorsionID1 %p\n", amoebaGpu->amoebaSim.pAmoebaTorsionID1 ); (void) fprintf( log, " pAmoebaTorsionID1 %p\n", amoebaGpu->amoebaSim.pAmoebaTorsionID1 );
...@@ -282,10 +291,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -282,10 +291,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaTorsionParameter2 %p\n", amoebaGpu->amoebaSim.pAmoebaTorsionParameter2 ); (void) fprintf( log, " pAmoebaTorsionParameter2 %p\n", amoebaGpu->amoebaSim.pAmoebaTorsionParameter2 );
if( amoebaGpu->psAmoebaPiTorsionID1)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaPiTorsionID1)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID1, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID2, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID2, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID3, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaPiTorsionID3, log );
gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaPiTorsionParameter, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaPiTorsionParameter, log );
(void) fprintf( log, " amoebaPiTorsions %u\n", amoebaGpu->amoebaSim.amoebaPiTorsions ); (void) fprintf( log, " amoebaPiTorsions %u\n", amoebaGpu->amoebaSim.amoebaPiTorsions );
(void) fprintf( log, " amoebaPiTorsion_offset %u\n", amoebaGpu->amoebaSim.amoebaPiTorsion_offset ); (void) fprintf( log, " amoebaPiTorsion_offset %u\n", amoebaGpu->amoebaSim.amoebaPiTorsion_offset );
...@@ -295,9 +304,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -295,9 +304,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaPiTorsionParameter %p\n", amoebaGpu->amoebaSim.pAmoebaPiTorsionParameter ); (void) fprintf( log, " pAmoebaPiTorsionParameter %p\n", amoebaGpu->amoebaSim.pAmoebaPiTorsionParameter );
if( amoebaGpu->psAmoebaStretchBendID1)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaStretchBendID1)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaStretchBendID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaStretchBendID1, log );
gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaStretchBendID2, log ); totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->psAmoebaStretchBendID2, log );
gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaStretchBendParameter, log ); totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaStretchBendParameter, log );
(void) fprintf( log, " amoebaStretchBend %u\n", amoebaGpu->amoebaSim.amoebaStretchBends ); (void) fprintf( log, " amoebaStretchBend %u\n", amoebaGpu->amoebaSim.amoebaStretchBends );
(void) fprintf( log, " amoebaStretchBend_offset %u\n", amoebaGpu->amoebaSim.amoebaStretchBend_offset ); (void) fprintf( log, " amoebaStretchBend_offset %u\n", amoebaGpu->amoebaSim.amoebaStretchBend_offset );
(void) fprintf( log, " pAmoebaStretchBendID1 %p\n", amoebaGpu->amoebaSim.pAmoebaStretchBendID1 ); (void) fprintf( log, " pAmoebaStretchBendID1 %p\n", amoebaGpu->amoebaSim.pAmoebaStretchBendID1 );
...@@ -305,9 +314,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -305,9 +314,9 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaStretchBendParameter %p\n", amoebaGpu->amoebaSim.pAmoebaStretchBendParameter ); (void) fprintf( log, " pAmoebaStretchBendParameter %p\n", amoebaGpu->amoebaSim.pAmoebaStretchBendParameter );
if( amoebaGpu->psAmoebaOutOfPlaneBendID1)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaOutOfPlaneBendID1)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID1, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID2, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaOutOfPlaneBendID2, log );
gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaOutOfPlaneBendParameter, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaOutOfPlaneBendParameter, log );
(void) fprintf( log, " amoebaOutOfPlaneBend %u\n", amoebaGpu->amoebaSim.amoebaOutOfPlaneBends ); (void) fprintf( log, " amoebaOutOfPlaneBend %u\n", amoebaGpu->amoebaSim.amoebaOutOfPlaneBends );
(void) fprintf( log, " amoebaOutOfPlaneBend_offset %u\n", amoebaGpu->amoebaSim.amoebaOutOfPlaneBend_offset ); (void) fprintf( log, " amoebaOutOfPlaneBend_offset %u\n", amoebaGpu->amoebaSim.amoebaOutOfPlaneBend_offset );
(void) fprintf( log, " amoebaOutOfPlaneBendCubicK %15.7e\n", amoebaGpu->amoebaSim.amoebaOutOfPlaneBendCubicK ); (void) fprintf( log, " amoebaOutOfPlaneBendCubicK %15.7e\n", amoebaGpu->amoebaSim.amoebaOutOfPlaneBendCubicK );
...@@ -319,10 +328,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -319,10 +328,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaOutOfPlaneBendParameter %p\n", amoebaGpu->amoebaSim.pAmoebaOutOfPlaneBendParameter ); (void) fprintf( log, " pAmoebaOutOfPlaneBendParameter %p\n", amoebaGpu->amoebaSim.pAmoebaOutOfPlaneBendParameter );
if( amoebaGpu->psAmoebaTorsionTorsionID1)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaTorsionTorsionID1)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID1, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID1, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID2, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID2, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID3, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaTorsionTorsionID3, log );
gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionTorsionGrids, log ); totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaTorsionTorsionGrids, log );
(void) fprintf( log, "\n" ); (void) fprintf( log, "\n" );
(void) fprintf( log, " amoebaTorsionTorsions %u\n", amoebaGpu->amoebaSim.amoebaTorsionTorsions ); (void) fprintf( log, " amoebaTorsionTorsions %u\n", amoebaGpu->amoebaSim.amoebaTorsionTorsions );
(void) fprintf( log, " amoebaTorsionTorsion_offset %u\n", amoebaGpu->amoebaSim.amoebaTorsionTorsion_offset ); (void) fprintf( log, " amoebaTorsionTorsion_offset %u\n", amoebaGpu->amoebaSim.amoebaTorsionTorsion_offset );
...@@ -333,8 +342,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -333,8 +342,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pOutputBufferCounter %p\n", amoebaGpu->gpuContext->pOutputBufferCounter ); (void) fprintf( log, " pOutputBufferCounter %p\n", amoebaGpu->gpuContext->pOutputBufferCounter );
if( amoebaGpu->psAmoebaUreyBradleyParameter)(void) fprintf( log, "\n" ); if( amoebaGpu->psAmoebaUreyBradleyParameter)(void) fprintf( log, "\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaUreyBradleyID, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaUreyBradleyID, log );
gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaUreyBradleyParameter, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psAmoebaUreyBradleyParameter, log );
(void) fprintf( log, " amoebaUreyBradleys %u\n", amoebaGpu->amoebaSim.amoebaUreyBradleys ); (void) fprintf( log, " amoebaUreyBradleys %u\n", amoebaGpu->amoebaSim.amoebaUreyBradleys );
(void) fprintf( log, " amoebaUreyBradley_offset %u\n", amoebaGpu->amoebaSim.amoebaUreyBradley_offset ); (void) fprintf( log, " amoebaUreyBradley_offset %u\n", amoebaGpu->amoebaSim.amoebaUreyBradley_offset );
(void) fprintf( log, " cubic %15.7e\n", amoebaGpu->amoebaSim.amoebaUreyBradleyCubicParameter); (void) fprintf( log, " cubic %15.7e\n", amoebaGpu->amoebaSim.amoebaUreyBradleyCubicParameter);
...@@ -343,20 +352,26 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -343,20 +352,26 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pAmoebaUreyBradleyParameter %p\n", amoebaGpu->amoebaSim.pAmoebaUreyBradleyParameter ); (void) fprintf( log, " pAmoebaUreyBradleyParameter %p\n", amoebaGpu->amoebaSim.pAmoebaUreyBradleyParameter );
(void) fprintf( log, "\n\n" ); (void) fprintf( log, "\n\n" );
gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log );
(void) fprintf( log, " pMultipoleParticlesIdsAndAxisType %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType); (void) fprintf( log, " pMultipoleParticlesIdsAndAxisType %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType);
(void) fprintf( log, " maxTorqueBufferIndex %d\n", amoebaGpu->maxTorqueBufferIndex ); (void) fprintf( log, " maxTorqueBufferIndex %d\n", amoebaGpu->maxTorqueBufferIndex );
gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log );
int memory = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log );
if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory;
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
(void) fprintf( log, " psMultipoleParticlesTorqueBufferIndices %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesTorqueBufferIndices); (void) fprintf( log, " psMultipoleParticlesTorqueBufferIndices %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesTorqueBufferIndices);
gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log );
(void) fprintf( log, " pMolecularDipole %p\n", amoebaGpu->amoebaSim.pMolecularDipole); (void) fprintf( log, " pMolecularDipole %p\n", amoebaGpu->amoebaSim.pMolecularDipole);
gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log );
(void) fprintf( log, " pMolecularQuadrupole %p\n", amoebaGpu->amoebaSim.pMolecularQuadrupole ); (void) fprintf( log, " pMolecularQuadrupole %p\n", amoebaGpu->amoebaSim.pMolecularQuadrupole );
gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameDipole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameDipole, log );
gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameQuadrupole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameQuadrupole, log );
(void) fprintf( log, " polarizationType %d\n", amoebaGpu->amoebaSim.polarizationType ); (void) fprintf( log, " polarizationType %d\n", amoebaGpu->amoebaSim.polarizationType );
(void) fprintf( log, " maxCovalentDegreeSz %d\n", amoebaGpu->maxCovalentDegreeSz ); (void) fprintf( log, " maxCovalentDegreeSz %d\n", amoebaGpu->maxCovalentDegreeSz );
...@@ -380,15 +395,11 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -380,15 +395,11 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " fd %15.7e\n", amoebaGpu->amoebaSim.fd ); (void) fprintf( log, " fd %15.7e\n", amoebaGpu->amoebaSim.fd );
(void) fprintf( log, " fq %15.7e\n", amoebaGpu->amoebaSim.fq ); (void) fprintf( log, " fq %15.7e\n", amoebaGpu->amoebaSim.fq );
gpuPrintCudaStreamFloat2( amoebaGpu->psDampingFactorAndThole, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psDampingFactorAndThole, log );
gpuPrintCudaStreamInt( amoebaGpu->psCovalentDegree, log );
gpuPrintCudaStreamInt( amoebaGpu->psPolarizationDegree, log );
gpuPrintCudaStreamFloat( amoebaGpu->psE_Field, log );
gpuPrintCudaStreamFloat( amoebaGpu->psE_FieldPolar, log );
gpuPrintCudaStreamFloat( amoebaGpu->psPolarizability, log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psE_Field, log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psE_FieldPolar, log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psPolarizability, log );
(void) fprintf( log, " mutualInducedIterativeMethod %d\n", amoebaGpu->mutualInducedIterativeMethod); (void) fprintf( log, " mutualInducedIterativeMethod %d\n", amoebaGpu->mutualInducedIterativeMethod);
(void) fprintf( log, " mutualInducedMaxIterations %d\n", amoebaGpu->mutualInducedMaxIterations); (void) fprintf( log, " mutualInducedMaxIterations %d\n", amoebaGpu->mutualInducedMaxIterations);
...@@ -396,19 +407,22 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -396,19 +407,22 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " mutualInducedTargetEpsilon %10.3e\n", amoebaGpu->mutualInducedTargetEpsilon); (void) fprintf( log, " mutualInducedTargetEpsilon %10.3e\n", amoebaGpu->mutualInducedTargetEpsilon);
(void) fprintf( log, " mutualInducedCurrentEpsilon %10.3e\n", amoebaGpu->mutualInducedCurrentEpsilon ); (void) fprintf( log, " mutualInducedCurrentEpsilon %10.3e\n", amoebaGpu->mutualInducedCurrentEpsilon );
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log );
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log );
(void) fprintf( log, " numberOfSorWorkVectors %u\n", amoebaGpu->numberOfSorWorkVectors); (void) fprintf( log, " numberOfSorWorkVectors %u\n", amoebaGpu->numberOfSorWorkVectors);
gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[0], log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[0], log );
gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[1], log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[2], log );
gpuPrintCudaStreamFloat( amoebaGpu->psGk_Field, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[3], log );
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipoleS, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolarS, log );
gpuPrintCudaStreamFloat( amoebaGpu->psBorn, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psGk_Field, log );
gpuPrintCudaStreamFloat( amoebaGpu->psBornPolar, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipoleS, log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolarS, log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psBorn, log );
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psBornPolar, log );
(void) fprintf( log, " includeObcCavityTerm %d\n", amoebaGpu->includeObcCavityTerm ); (void) fprintf( log, " includeObcCavityTerm %d\n", amoebaGpu->includeObcCavityTerm );
(void) fprintf( log, " dielectricOffset %15.7e\n", gpu->sim.dielectricOffset ); (void) fprintf( log, " dielectricOffset %15.7e\n", gpu->sim.dielectricOffset );
(void) fprintf( log, " probeRadius %15.7e\n", gpu->sim.probeRadius ); (void) fprintf( log, " probeRadius %15.7e\n", gpu->sim.probeRadius );
...@@ -420,14 +434,14 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -420,14 +434,14 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " vdwEpsilonCombiningRule %d\n", amoebaGpu->vdwEpsilonCombiningRule); (void) fprintf( log, " vdwEpsilonCombiningRule %d\n", amoebaGpu->vdwEpsilonCombiningRule);
(void) fprintf( log, " vdwUsePBC %d\n", amoebaGpu->amoebaSim.vdwUsePBC); (void) fprintf( log, " vdwUsePBC %d\n", amoebaGpu->amoebaSim.vdwUsePBC);
(void) fprintf( log, " vdwCutoff2 %15.7e\n", amoebaGpu->amoebaSim.vdwCutoff2); (void) fprintf( log, " vdwCutoff2 %15.7e\n", amoebaGpu->amoebaSim.vdwCutoff2);
gpuPrintCudaStreamFloat2( amoebaGpu->psVdwSigmaEpsilon, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psVdwSigmaEpsilon, log );
gpuPrintCudaStreamInt( amoebaGpu->psAmoebaVdwNonReductionID, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psAmoebaVdwNonReductionID, log );
gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaVdwReductionID, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaVdwReductionID, log );
gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaVdwReduction, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaVdwReduction, log );
gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaVdwCoordinates, log ); totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psAmoebaVdwCoordinates, log );
gpuPrintCudaStreamUnsignedInt( amoebaGpu->psVdwWorkUnit, log ); totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psVdwWorkUnit, log );
gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndicesIndex, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndicesIndex, log );
gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndices, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psVdwExclusionIndices, log );
(void) fprintf( log, " amoebaVdwNonReductions %u\n", amoebaGpu->amoebaSim.amoebaVdwNonReductions ); (void) fprintf( log, " amoebaVdwNonReductions %u\n", amoebaGpu->amoebaSim.amoebaVdwNonReductions );
(void) fprintf( log, " pAmoebaVdwNonReductionID %p\n", amoebaGpu->amoebaSim.pAmoebaVdwNonReductionID ); (void) fprintf( log, " pAmoebaVdwNonReductionID %p\n", amoebaGpu->amoebaSim.pAmoebaVdwNonReductionID );
(void) fprintf( log, " amoebaVdwReductions %u\n", amoebaGpu->amoebaSim.amoebaVdwReductions ); (void) fprintf( log, " amoebaVdwReductions %u\n", amoebaGpu->amoebaSim.amoebaVdwReductions );
...@@ -436,7 +450,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -436,7 +450,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pVdwExclusionIndicesIndex %p\n", amoebaGpu->amoebaSim.pVdwExclusionIndicesIndex); (void) fprintf( log, " pVdwExclusionIndicesIndex %p\n", amoebaGpu->amoebaSim.pVdwExclusionIndicesIndex);
(void) fprintf( log, " pVdwExclusionIndices %p\n", amoebaGpu->amoebaSim.pVdwExclusionIndices); (void) fprintf( log, " pVdwExclusionIndices %p\n", amoebaGpu->amoebaSim.pVdwExclusionIndices);
gpuPrintCudaStreamFloat2( amoebaGpu->psWcaDispersionRadiusEpsilon, log ); totalMemory += gpuPrintCudaStreamFloat2( amoebaGpu->psWcaDispersionRadiusEpsilon, log );
(void) fprintf( log, "\n" ); (void) fprintf( log, "\n" );
(void) fprintf( log, " epso %15.7e\n", amoebaGpu->amoebaSim.epso ); (void) fprintf( log, " epso %15.7e\n", amoebaGpu->amoebaSim.epso );
(void) fprintf( log, " epsh %15.7e\n", amoebaGpu->amoebaSim.epsh ); (void) fprintf( log, " epsh %15.7e\n", amoebaGpu->amoebaSim.epsh );
...@@ -447,6 +461,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -447,6 +461,8 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " dispoff %15.7e\n", amoebaGpu->amoebaSim.dispoff ); (void) fprintf( log, " dispoff %15.7e\n", amoebaGpu->amoebaSim.dispoff );
(void) fprintf( log, " totalMaxWcaDispersionEnergy %15.7e\n", amoebaGpu->amoebaSim.totalMaxWcaDispersionEnergy ); (void) fprintf( log, " totalMaxWcaDispersionEnergy %15.7e\n", amoebaGpu->amoebaSim.totalMaxWcaDispersionEnergy );
(void) fprintf( log, " total array memory %d\n", totalMemory );
(void) fflush( log ); (void) fflush( log );
} }
...@@ -1349,20 +1365,14 @@ static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu ) ...@@ -1349,20 +1365,14 @@ static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu )
amoebaGpu->psDampingFactorAndThole = new CUDAStream<float2>(paddedNumberOfAtoms, 1, "DampingFactorAndThole"); amoebaGpu->psDampingFactorAndThole = new CUDAStream<float2>(paddedNumberOfAtoms, 1, "DampingFactorAndThole");
amoebaGpu->amoebaSim.pDampingFactorAndThole = amoebaGpu->psDampingFactorAndThole->_pDevData; amoebaGpu->amoebaSim.pDampingFactorAndThole = amoebaGpu->psDampingFactorAndThole->_pDevData;
amoebaGpu->psCovalentDegree = new CUDAStream<int>(amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 1, "CovalentDegree"); amoebaGpu->covalentDegree.resize( amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 0 );
amoebaGpu->psPolarizationDegree = new CUDAStream<int>(amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 1, "PolarizationDegree"); amoebaGpu->polarizationDegree.resize( amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms, 0 );
unsigned int offset = paddedNumberOfAtoms*sizeof( float ); unsigned int offset = paddedNumberOfAtoms*sizeof( float );
memset( amoebaGpu->psDampingFactorAndThole->_pSysData, 0,2*offset ); memset( amoebaGpu->psDampingFactorAndThole->_pSysData, 0,2*offset );
//memset( amoebaGpu->psE_Field->_pSysData, 0, offset*3 ); //memset( amoebaGpu->psE_Field->_pSysData, 0, offset*3 );
//memset( amoebaGpu->psE_FieldPolar->_pSysData, 0, offset*3 ); //memset( amoebaGpu->psE_FieldPolar->_pSysData, 0, offset*3 );
// should be removed XXXXX
offset = amoebaGpu->maxCovalentDegreeSz*paddedNumberOfAtoms*sizeof( int );
memset( amoebaGpu->psCovalentDegree->_pSysData, 0, offset );
memset( amoebaGpu->psPolarizationDegree->_pSysData, 0, offset );
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -1666,7 +1676,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1666,7 +1676,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
const int particlesOffset = ii*amoebaGpu->maxCovalentDegreeSz; const int particlesOffset = ii*amoebaGpu->maxCovalentDegreeSz;
const int minCovalentIndex = minCovalentIndices[ii]; const int minCovalentIndex = minCovalentIndices[ii];
amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] = minCovalentIndex; amoebaGpu->covalentDegree[particlesOffset] = minCovalentIndex;
// covalent info // covalent info
...@@ -1680,7 +1690,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1680,7 +1690,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
") is out of range -- maxCovalentDegreeSz needs to be increased." << std::endl; ") is out of range -- maxCovalentDegreeSz needs to be increased." << std::endl;
errorCount++; errorCount++;
} else { } else {
amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+covalentIndex] = covalentDegree[jj] + 1; amoebaGpu->covalentDegree[particlesOffset+covalentIndex] = covalentDegree[jj] + 1;
} }
} }
} }
...@@ -1688,7 +1698,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1688,7 +1698,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
// polarization covalent info // polarization covalent info
const int minCovalentPolarizationIndex = minCovalentPolarizationIndices[ii]; const int minCovalentPolarizationIndex = minCovalentPolarizationIndices[ii];
amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] = minCovalentPolarizationIndex; amoebaGpu->polarizationDegree[particlesOffset] = minCovalentPolarizationIndex;
for( unsigned int jj = 4; jj < covalentInfo.size(); jj++ ){ for( unsigned int jj = 4; jj < covalentInfo.size(); jj++ ){
const std::vector<int> covalentList = covalentInfo[jj]; const std::vector<int> covalentList = covalentInfo[jj];
...@@ -1699,7 +1709,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1699,7 +1709,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
") is out of range -- maxCovalentDegreeSz needs to be increased." << std::endl; ") is out of range -- maxCovalentDegreeSz needs to be increased." << std::endl;
errorCount++; errorCount++;
} else { } else {
amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+covalentIndex] = covalentDegree[jj] + 1; amoebaGpu->polarizationDegree[particlesOffset+covalentIndex] = covalentDegree[jj] + 1;
} }
} }
} }
...@@ -1739,7 +1749,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1739,7 +1749,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
// covalent/polarization degree // covalent/polarization degree
(void) fprintf( amoebaGpu->log,"%3d covalent/polarization degree: minIdx[%6d %6d] Thole=%12.5f dampingFactor=%12.5f\n", ii, (void) fprintf( amoebaGpu->log,"%3d covalent/polarization degree: minIdx[%6d %6d] Thole=%12.5f dampingFactor=%12.5f\n", ii,
amoebaGpu->psCovalentDegree->_pSysData[particlesOffset], amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset], amoebaGpu->covalentDegree[particlesOffset], amoebaGpu->polarizationDegree[particlesOffset],
amoebaGpu->psDampingFactorAndThole->_pSysData[ii].y, amoebaGpu->psDampingFactorAndThole->_pSysData[ii].x ); amoebaGpu->psDampingFactorAndThole->_pSysData[ii].y, amoebaGpu->psDampingFactorAndThole->_pSysData[ii].x );
// covalent // covalent
...@@ -1752,20 +1762,20 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1752,20 +1762,20 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
int count = 0; int count = 0;
for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){ for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
if( amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj] == kk ){ if( amoebaGpu->covalentDegree[particlesOffset+jj] == kk ){
if( count == 0 ){ if( count == 0 ){
(void) fprintf( amoebaGpu->log,"%d [", kk ); (void) fprintf( amoebaGpu->log,"%d [", kk );
} }
float pScale = polarScale[kk-1]; float pScale = polarScale[kk-1];
int particle2Index = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] + jj - 1; int particle2Index = amoebaGpu->covalentDegree[particlesOffset] + jj - 1;
if( kk == 4 && particle2Index >= amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] ){ if( kk == 4 && particle2Index >= amoebaGpu->polarizationDegree[particlesOffset] ){
int particle2Offset = particle2Index - amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] + 1; int particle2Offset = particle2Index - amoebaGpu->polarizationDegree[particlesOffset] + 1;
if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+particle2Offset] == 1 ){ if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->polarizationDegree[particlesOffset+particle2Offset] == 1 ){
pScale *= 0.5; pScale *= 0.5;
} }
} }
(void) fprintf( amoebaGpu->log,"%5d %5.1f ", (void) fprintf( amoebaGpu->log,"%5d %5.1f ",
amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] + jj - 1, pScale ); amoebaGpu->covalentDegree[particlesOffset] + jj - 1, pScale );
count++; count++;
} }
} }
...@@ -1782,11 +1792,11 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1782,11 +1792,11 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
int count = 0; int count = 0;
for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){ for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
if( amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj] == kk ){ if( amoebaGpu->polarizationDegree[particlesOffset+jj] == kk ){
if( count == 0 ){ if( count == 0 ){
(void) fprintf( amoebaGpu->log,"%d [", kk ); (void) fprintf( amoebaGpu->log,"%d [", kk );
} }
(void) fprintf( amoebaGpu->log,"%5d ", amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] + jj - 1 ); (void) fprintf( amoebaGpu->log,"%5d ", amoebaGpu->polarizationDegree[particlesOffset] + jj - 1 );
count++; count++;
} }
} }
...@@ -1818,19 +1828,19 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1818,19 +1828,19 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
// print entries w/ degree=kk // print entries w/ degree=kk
for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){ for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
if( amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj] ){ if( amoebaGpu->covalentDegree[particlesOffset+jj] ){
int index = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj]; int index = amoebaGpu->covalentDegree[particlesOffset+jj];
float pScale = polarScale[index-1]; float pScale = polarScale[index-1];
float mScale = mpoleScale[index-1]; float mScale = mpoleScale[index-1];
int particle2Index = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset] + jj - 1; int particle2Index = amoebaGpu->covalentDegree[particlesOffset] + jj - 1;
if( index == 4 && particle2Index >= amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] ){ if( index == 4 && particle2Index >= amoebaGpu->polarizationDegree[particlesOffset] ){
int particle2Offset = particle2Index - amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset] + 1; int particle2Offset = particle2Index - amoebaGpu->polarizationDegree[particlesOffset] + 1;
if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+particle2Offset] == 1 ){ if( particle2Offset < amoebaGpu->maxCovalentDegreeSz && amoebaGpu->polarizationDegree[particlesOffset+particle2Offset] == 1 ){
pScale *= 0.5; pScale *= 0.5;
} }
} }
pScaleCheckSum[ii] += (pScale - 1.0f); pScaleCheckSum[ii] += (pScale - 1.0f);
int covIndex = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset]; int covIndex = amoebaGpu->covalentDegree[particlesOffset];
if( pScale != 1.0f ){ if( pScale != 1.0f ){
MapIntFloat* pMap = amoebaGpu->pMapArray[ii]; MapIntFloat* pMap = amoebaGpu->pMapArray[ii];
(*pMap)[covIndex+jj-1] = pScale; (*pMap)[covIndex+jj-1] = pScale;
...@@ -1841,10 +1851,10 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1841,10 +1851,10 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
// polarization // polarization
for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){ for( int jj = 1; jj < amoebaGpu->maxCovalentDegreeSz; jj++ ){
if( amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj] ){ if( amoebaGpu->polarizationDegree[particlesOffset+jj] ){
int index = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj]; int index = amoebaGpu->polarizationDegree[particlesOffset+jj];
dScaleCheckSum[ii] += (directScale[index-1] - 1.0f); dScaleCheckSum[ii] += (directScale[index-1] - 1.0f);
int covIndex = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset]; int covIndex = amoebaGpu->polarizationDegree[particlesOffset];
if( directScale[index-1] != 1.0f ){ if( directScale[index-1] != 1.0f ){
MapIntFloat* dMap = amoebaGpu->dMapArray[ii]; MapIntFloat* dMap = amoebaGpu->dMapArray[ii];
(*dMap)[covIndex+jj-1] = directScale[index-1]; (*dMap)[covIndex+jj-1] = directScale[index-1];
...@@ -1872,8 +1882,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1872,8 +1882,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Upload(); amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Upload();
amoebaGpu->psMolecularDipole->Upload(); amoebaGpu->psMolecularDipole->Upload();
amoebaGpu->psMolecularQuadrupole->Upload(); amoebaGpu->psMolecularQuadrupole->Upload();
amoebaGpu->psCovalentDegree->Upload();
amoebaGpu->psPolarizationDegree->Upload();
amoebaGpu->psDampingFactorAndThole->Upload(); amoebaGpu->psDampingFactorAndThole->Upload();
amoebaGpu->psPolarizability->Upload(); amoebaGpu->psPolarizability->Upload();
amoebaGpu->gpuContext->psPosq4->Upload(); amoebaGpu->gpuContext->psPosq4->Upload();
...@@ -2652,21 +2660,27 @@ void amoebaGpuShutDown(amoebaGpuContext gpu) ...@@ -2652,21 +2660,27 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
delete gpu->psMolecularDipole; delete gpu->psMolecularDipole;
delete gpu->psMolecularQuadrupole; delete gpu->psMolecularQuadrupole;
delete gpu->psLabFrameDipole; delete gpu->psLabFrameDipole;
delete gpu->psLabFrameQuadrupole; delete gpu->psLabFrameQuadrupole;
delete gpu->psDampingFactorAndThole; delete gpu->psDampingFactorAndThole;
delete gpu->psCovalentDegree;
delete gpu->psPolarizationDegree;
delete gpu->psE_Field; delete gpu->psE_Field;
delete gpu->psE_FieldPolar; delete gpu->psE_FieldPolar;
delete gpu->psInducedDipole; delete gpu->psInducedDipole;
delete gpu->psInducedDipolePolar; delete gpu->psInducedDipolePolar;
delete gpu->psPolarizability; delete gpu->psPolarizability;
delete gpu->psCurrentEpsilon; delete gpu->psCurrentEpsilon;
delete gpu->psWorkVector[0]; delete gpu->psWorkVector[0];
delete gpu->psWorkVector[1]; delete gpu->psWorkVector[1];
delete gpu->psWorkVector[2]; delete gpu->psWorkVector[2];
delete gpu->psWorkVector[3]; delete gpu->psWorkVector[3];
delete gpu->psTorque; delete gpu->psTorque;
delete gpu->psGk_Field; delete gpu->psGk_Field;
...@@ -2773,6 +2787,7 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener ...@@ -2773,6 +2787,7 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener
amoebaGpu->amoebaSim.pWorkArray_3_2 = amoebaGpu->psWorkArray_3_2->_pDevData; amoebaGpu->amoebaSim.pWorkArray_3_2 = amoebaGpu->psWorkArray_3_2->_pDevData;
// used in GK calculations // used in GK calculations
if( hasAmoebaGeneralizedKirkwood ) if( hasAmoebaGeneralizedKirkwood )
{ {
if( amoebaGpu->psWorkArray_3_3 ) if( amoebaGpu->psWorkArray_3_3 )
...@@ -2810,19 +2825,19 @@ static void getScalingDegrees( amoebaGpuContext amoebaGpu, unsigned int particle ...@@ -2810,19 +2825,19 @@ static void getScalingDegrees( amoebaGpuContext amoebaGpu, unsigned int particle
{ {
int particlesOffset = particleI*amoebaGpu->maxCovalentDegreeSz; int particlesOffset = particleI*amoebaGpu->maxCovalentDegreeSz;
unsigned int minCovalentIndex = static_cast<unsigned int>(amoebaGpu->psCovalentDegree->_pSysData[particlesOffset]); unsigned int minCovalentIndex = static_cast<unsigned int>(amoebaGpu->covalentDegree[particlesOffset]);
unsigned int minCovalentPolarizationIndex = static_cast<unsigned int>(amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset]); unsigned int minCovalentPolarizationIndex = static_cast<unsigned int>(amoebaGpu->polarizationDegree[particlesOffset]);
if( particleJ < minCovalentIndex || particleJ >= (minCovalentIndex + amoebaGpu->maxCovalentDegreeSz-1) ){ if( particleJ < minCovalentIndex || particleJ >= (minCovalentIndex + amoebaGpu->maxCovalentDegreeSz-1) ){
*covalentDegree = 0; *covalentDegree = 0;
} else { } else {
*covalentDegree = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset + (particleJ-minCovalentIndex) + 1]; *covalentDegree = amoebaGpu->covalentDegree[particlesOffset + (particleJ-minCovalentIndex) + 1];
} }
if( particleJ < minCovalentPolarizationIndex || particleJ >= (minCovalentPolarizationIndex + amoebaGpu->maxCovalentDegreeSz-1) ){ if( particleJ < minCovalentPolarizationIndex || particleJ >= (minCovalentPolarizationIndex + amoebaGpu->maxCovalentDegreeSz-1) ){
*polarizationDegree = 0; *polarizationDegree = 0;
} else { } else {
*polarizationDegree = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset + (particleJ-minCovalentPolarizationIndex) + 1]; *polarizationDegree = amoebaGpu->polarizationDegree[particlesOffset + (particleJ-minCovalentPolarizationIndex) + 1];
} }
/* if( *covalentDegree > 5 || *polarizationDegree > 5 ){ /* if( *covalentDegree > 5 || *polarizationDegree > 5 ){
...@@ -2886,9 +2901,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -2886,9 +2901,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
if( amoebaGpu->psCovalentDegree == NULL ){ if( amoebaGpu->covalentDegree.size() < 1 )return;
return;
}
const unsigned int paddedAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; const unsigned int paddedAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
const unsigned int actualAtoms = amoebaGpu->gpuContext->natoms; const unsigned int actualAtoms = amoebaGpu->gpuContext->natoms;
...@@ -2914,18 +2927,18 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -2914,18 +2927,18 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
{ {
int x = atom1/grid; int x = atom1/grid;
int particlesOffset = atom1*amoebaGpu->maxCovalentDegreeSz; int particlesOffset = atom1*amoebaGpu->maxCovalentDegreeSz;
int minCovalentIndex = amoebaGpu->psCovalentDegree->_pSysData[particlesOffset]; int minCovalentIndex = amoebaGpu->covalentDegree[particlesOffset];
int minPolarCovIndex = amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset]; int minPolarCovIndex = amoebaGpu->polarizationDegree[particlesOffset];
int maxCIndex = 0; int maxCIndex = 0;
int maxPIndex = 0; int maxPIndex = 0;
for (int jj = amoebaGpu->maxCovalentDegreeSz - 1; jj >= 1 && (maxPIndex == 0 || maxCIndex == 0); jj-- ) for (int jj = amoebaGpu->maxCovalentDegreeSz - 1; jj >= 1 && (maxPIndex == 0 || maxCIndex == 0); jj-- )
{ {
if( amoebaGpu->psCovalentDegree->_pSysData[particlesOffset+jj] && maxCellIndex[x] < (minCovalentIndex+jj) ) if( amoebaGpu->covalentDegree[particlesOffset+jj] && maxCellIndex[x] < (minCovalentIndex+jj) )
{ {
maxCellIndex[x] = minCovalentIndex + jj; maxCellIndex[x] = minCovalentIndex + jj;
maxCIndex++; maxCIndex++;
} }
if( amoebaGpu->psPolarizationDegree->_pSysData[particlesOffset+jj] && maxCellIndex[x] < (minPolarCovIndex+jj) ) if( amoebaGpu->polarizationDegree[particlesOffset+jj] && maxCellIndex[x] < (minPolarCovIndex+jj) )
{ {
maxCellIndex[x] = minPolarCovIndex + jj; maxCellIndex[x] = minPolarCovIndex + jj;
maxPIndex++; maxPIndex++;
......
...@@ -30,20 +30,10 @@ ...@@ -30,20 +30,10 @@
#include "kernels/gputypes.h" #include "kernels/gputypes.h"
#include "amoebaCudaTypes.h" #include "amoebaCudaTypes.h"
#define THREADS_PER_BLOCK 256
#include <map> #include <map>
typedef std::map<int,float> MapIntFloat; typedef std::map<int,float> MapIntFloat;
typedef MapIntFloat::const_iterator MapIntFloatCI; typedef MapIntFloat::const_iterator MapIntFloatCI;
/*
* Remove
* pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
* allocation of torqueMapForce psCovalentDegree psPolarizationDegree
*
THREADS_PER_BLOCK
*/
struct _amoebaGpuContext { struct _amoebaGpuContext {
_gpuContext* gpuContext; _gpuContext* gpuContext;
...@@ -112,7 +102,6 @@ struct _amoebaGpuContext { ...@@ -112,7 +102,6 @@ struct _amoebaGpuContext {
// multipole parameters // multipole parameters
CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType; CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
CUDAStream<int>* psMultipoleAxisOffset;
// buffer indices used for mapping torques onto forces // buffer indices used for mapping torques onto forces
...@@ -133,10 +122,10 @@ struct _amoebaGpuContext { ...@@ -133,10 +122,10 @@ struct _amoebaGpuContext {
CUDAStream<float2>* psDampingFactorAndThole; CUDAStream<float2>* psDampingFactorAndThole;
// slated for removal -- no longer used // used to set up scaling constants
CUDAStream<int>* psCovalentDegree; std::vector<int> covalentDegree;
CUDAStream<int>* psPolarizationDegree; std::vector<int> polarizationDegree;
// fixed-E field // fixed-E field
......
...@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel( ...@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel(
float* inducedDipolePolarS ) float* inducedDipolePolarS )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms )return; while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
inducedDipole[pos] = fixedEField[pos];
fixedEField[threadId] *= polarizability[threadId]; fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipole[threadId] = fixedEField[threadId]; inducedDipolePolar[pos] = fixedEFieldPolar[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId]; fixedGkField[pos] *= polarizability[pos];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId]; inducedDipoleS[pos] = fixedEField[pos] + fixedGkField[pos];
inducedDipolePolarS[pos] = fixedEFieldPolar[pos] + fixedGkField[pos];
fixedGkField[threadId] *= polarizability[threadId]; pos += blockDim.x*gridDim.x;
inducedDipoleS[threadId] = fixedEField[threadId] + fixedGkField[threadId]; }
inducedDipolePolarS[threadId] = fixedEFieldPolar[threadId] + fixedGkField[threadId];
} }
...@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel( ...@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel(
{ {
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms)return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId]; float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[threadId]; float previousDipoleP = inducedDipoleP[pos];
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId]; inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId]; inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP ); inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
} }
__global__ __global__
...@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel( ...@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
{ {
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms)return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId]; float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[threadId]; float previousDipoleP = inducedDipoleP[pos];
inducedDipole[threadId] = fixedGkField[threadId] + fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId]; inducedDipole[pos] = fixedGkField[pos] + fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[threadId] = fixedGkField[threadId] + fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId]; inducedDipoleP[pos] = fixedGkField[pos] + fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP ); inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
} }
// reduce psWorkArray_3_1 -> outputArray // reduce psWorkArray_3_1 -> outputArray
...@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu, ...@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
LAUNCHERROR("kReduceMutualInducedAndGkFields4"); LAUNCHERROR("kReduceMutualInducedAndGkFields4");
} }
#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(gpu->sim.paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
}
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
#endif
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute mutual induced field Compute mutual induced field
...@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu->psWorkArray_3_3->Download(); amoebaGpu->psWorkArray_3_3->Download();
amoebaGpu->psWorkArray_3_4->Download(); amoebaGpu->psWorkArray_3_4->Download();
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printMiFieldAtomBuffers( amoebaGpu, 100 );
//printMiFieldBuffer( amoebaGpu, 0 );
//printMiFieldBuffer( amoebaGpu, 1 );
//printMiFieldBuffer( amoebaGpu, 37 );
//printMiFieldBuffer( amoebaGpu, 38 );
if( amoebaGpu->log && iteration == 1 ){ if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
...@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>( kInitializeMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psGk_Field->_pDevData, amoebaGpu->psGk_Field->_pDevData,
...@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData ); amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1"); LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");
kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedAndGkFieldS_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData, amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
...@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel( ...@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel(
float* inducedDipolePolar ) float* inducedDipolePolar )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return; while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
inducedDipole[pos] = fixedEField[pos];
fixedEField[threadId] *= polarizability[threadId]; fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipole[threadId] = fixedEField[threadId]; inducedDipolePolar[pos] = fixedEFieldPolar[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId]; pos += blockDim.x*gridDim.x;
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId]; }
} }
...@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel( ...@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel(
{ {
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
float previousDipole = inducedDipole[threadId]; inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
float previousDipoleP = inducedDipoleP[threadId]; inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId]; inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId]; inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP ); matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); pos += blockDim.x*gridDim.x;
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); }
} }
...@@ -470,28 +478,13 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -470,28 +478,13 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>( kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, gpu->natoms,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
...@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData, gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
...@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel( ...@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel(
float* inducedDipolePolar ) float* inducedDipolePolar )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return; while( pos < 3*cSim.atoms )
{
fixedEField[threadId] *= polarizability[threadId]; fixedEField[pos] *= polarizability[pos];
inducedDipole[threadId] = fixedEField[threadId]; inducedDipole[pos] = fixedEField[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId]; fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId]; inducedDipolePolar[pos] = fixedEFieldPolar[pos];
pos += blockDim.x*gridDim.x;
}
} }
...@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel( ...@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel(
float* matrixProduct, float* matrixProductP ) float* matrixProduct, float* matrixProductP )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId]; float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[threadId]; float previousDipoleP = inducedDipoleP[pos];
// add self terms to fields // add self terms to fields
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
matrixProduct[threadId] += term*previousDipole; matrixProduct[pos] += term*previousDipole;
matrixProductP[threadId] += term*previousDipoleP; matrixProductP[pos] += term*previousDipoleP;
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId]; inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId]; inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
const float polarSOR = 0.70f; const float polarSOR = 0.70f;
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP ); inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); pos += blockDim.x*gridDim.x;
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); }
} }
...@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>( kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, gpu->natoms,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
...@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData, gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include <stdio.h>
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#define AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
/**
 * Reduce an Nsz x Nsz device array to an Nsz device array by summing over
 * the second index.
 *
 * The reduction runs in two passes:
 *   1. a block-level pass where every block sums up to numThreads entries of
 *      one row and stores one partial sum per block in a scratch buffer;
 *   2. a final pass where one thread per row adds up that row's partial sums.
 *
 * @param N2Array          device input, indexed as offset*(i*Nsz + j)
 * @param Nsz              number of rows (and columns) of the input
 * @param NArray           device output, indexed as offset*i
 * @param includeDiagonal  if nonzero the i == j entries are included in the
 *                         sum, otherwise they are skipped
 * @param offset           number of floats per entry; only 1 and 3 have
 *                         kernel instantiations
 *
 * If Nsz <= 0 or offset is neither 1 nor 3 the function returns without
 * touching NArray. If the scratch buffer cannot be allocated an error is
 * written to stderr and the function returns without launching any kernel.
 */
void cudaReduceN2ToN( float *N2Array, int Nsz, float *NArray, int includeDiagonal, int offset )
{
    // nothing to reduce; this also avoids dividing by numThreads == 0 below
    if( Nsz <= 0 ){
        return;
    }

    // no kernel exists for other strides, so there is no work to do
    if( offset != 1 && offset != 3 ){
        return;
    }

    const int numThreads       = min(THREADS_PER_BLOCK, (Nsz));
    const int numBlocksPerAtom = (Nsz + numThreads - 1)/numThreads;
    const int numBlocks        = numBlocksPerAtom*Nsz;

    // allocate GPU memory: one partial sum (offset floats) per block

    float *partialSum1_d = NULL;
    cudaError_t status   = cudaMalloc( (void**) &partialSum1_d, ((size_t) numBlocks)*offset*sizeof(float) );
    if( status != cudaSuccess ){
        (void) fprintf( stderr, "cudaReduceN2ToN: cudaMalloc of %d floats failed: %s\n",
                        numBlocks*offset, cudaGetErrorString( status ) );
        return;
    }

    // pass 1: block-level partial sums

    if( includeDiagonal ){
        if( offset == 3 ){
            kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
            LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel1");
        } else {
            kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
            LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel2");
        }
    } else {
        if( offset == 3 ){
            kCalculateAmoebaReduceExcludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
            LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel3");
        } else {
            kCalculateAmoebaReduceExcludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
            LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel4");
        }
    }

    // pass 2: the final kernel uses one thread per row, so ceil(Nsz/numThreads)
    // blocks (== numBlocksPerAtom) cover every row. The final kernel does not
    // depend on AMOEBA_INCLUDE_DIAGONAL, hence the IncludeDiagonal
    // instantiation is used for both settings of includeDiagonal.

    const int numFinalBlocks = numBlocksPerAtom;
    if( offset == 3 ){
        kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNFinal<<< numFinalBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
        LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal3");
    } else {
        kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNFinal<<< numFinalBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
        LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal1");
    }

    // free memory

    status = cudaFree( partialSum1_d );
    if( status != cudaSuccess ){
        (void) fprintf( stderr, "cudaReduceN2ToN: cudaFree failed: %s\n", cudaGetErrorString( status ) );
    }
}
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
typedef unsigned int uint;
/**
 * Block-level pass of the N2 -> N reduction.
 *
 * Launch layout: numberOfBlocksPerAtom consecutive blocks handle one row
 * (atomI = blockIdx.x / numberOfBlocksPerAtom); thread tid of a block handles
 * column atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x + tid.
 * blockDim.x must not exceed THREADS_PER_BLOCK (size of the shared arrays);
 * it does not have to be a power of two.
 *
 * @param N2Array                input, entry (i,j) starts at offset*(i*num + j)
 * @param partialSum             output, one partial sum per block starting at
 *                               blockIdx.x*offset
 * @param num                    number of rows/columns
 * @param numberOfBlocksPerAtom  number of blocks assigned to each row
 *
 * offset is 3 when AMOEBA_OFFSET_3 is defined and 1 otherwise; the i == j
 * entry is skipped unless AMOEBA_INCLUDE_DIAGONAL is defined.
 */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNBlockLevel)( float *N2Array, float *partialSum, int num,int numberOfBlocksPerAtom )
{
    uint tid = threadIdx.x;

    // threads without an entry to load contribute 0 to the sum

    __shared__ float asx[THREADS_PER_BLOCK];
    asx[tid] = 0.0f;
#ifdef AMOEBA_OFFSET_3
    __shared__ float asy[THREADS_PER_BLOCK];
    __shared__ float asz[THREADS_PER_BLOCK];
    asy[tid] = asz[tid] = 0.0f;
    int offset = 3;
#else
    int offset = 1;
#endif

    int atomI = blockIdx.x / numberOfBlocksPerAtom;
    int atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x+tid;

    bool loadEntry = ( atomI < num && atomJ < num );
#ifndef AMOEBA_INCLUDE_DIAGONAL
    loadEntry = loadEntry && ( atomJ != atomI );
#endif

    if( loadEntry ){
        int index = offset*(atomI*num + atomJ);
        asx[tid] = N2Array[index];
#ifdef AMOEBA_OFFSET_3
        asy[tid] = N2Array[index+1];
        asz[tid] = N2Array[index+2];
#endif
    }

    __syncthreads(); //to make sure all the elements are loaded

    // Tree reduction over the first 'active' slots. The upper half is folded
    // onto the lower half; 'half' is rounded up so that for an odd count the
    // middle element is kept instead of being dropped. For a power-of-two
    // blockDim.x this is the usual halving sequence. The loop bounds depend
    // only on blockDim.x, so every thread reaches each __syncthreads().

    for( uint active = blockDim.x; active > 1; ){
        uint half = (active + 1) >> 1;
        if( tid + half < active ){
            asx[tid] += asx[tid+half];
#ifdef AMOEBA_OFFSET_3
            asy[tid] += asy[tid+half];
            asz[tid] += asz[tid+half];
#endif
        }
        __syncthreads();
        active = half;
    }

    // thread 0 holds the block's sum

    if( tid == 0 ){
        partialSum[blockIdx.x*offset] = asx[0];
#ifdef AMOEBA_OFFSET_3
        partialSum[blockIdx.x*3+1] = asy[0];
        partialSum[blockIdx.x*3+2] = asz[0];
#endif
    }
}
/**
 * Final pass of the N2 -> N reduction: one thread per row adds up that row's
 * block-level partial sums.
 *
 * Launch layout: 1D grid; thread_id = blockIdx.x*blockDim.x + threadIdx.x is
 * the row index. Threads with thread_id >= num do nothing, so the grid may be
 * larger than needed.
 *
 * @param partialSum             input, the numberOfBlocksPerAtom partial sums
 *                               of row i start at i*offset*numberOfBlocksPerAtom
 * @param final                  output, the sum for row i starts at i*offset
 * @param num                    number of rows
 * @param numberOfBlocksPerAtom  number of partial sums per row
 *
 * offset is 3 when AMOEBA_OFFSET_3 is defined and 1 otherwise.
 */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNFinal)( float *partialSum, float *final,int num,int numberOfBlocksPerAtom )
{
    uint thread_id = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;

    // rows are 0 .. num-1; thread_id == num would read and write out of bounds
    if( num <= 0 || thread_id >= (uint) num )return;

    float3 sum;
#ifdef AMOEBA_OFFSET_3
    int offset = 3;
    sum.x = sum.y = sum.z = 0.0f;
#else
    int offset = 1;
    sum.x = 0.0f;
#endif

    // first partial sum belonging to this row
    int index = thread_id*offset*numberOfBlocksPerAtom;

    for( int i=0; i < numberOfBlocksPerAtom; i++ ){
        sum.x += partialSum[index + i*offset];
#ifdef AMOEBA_OFFSET_3
        sum.y += partialSum[index + i*offset+1];
        sum.z += partialSum[index + i*offset+2];
#endif
    }

    final[thread_id*offset ] = sum.x;
#ifdef AMOEBA_OFFSET_3
    final[thread_id*3+1 ] = sum.y;
    final[thread_id*3+2 ] = sum.z;
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment