Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2b508482
Commit
2b508482
authored
Nov 23, 2011
by
Mark Friedrichs
Browse files
Added copyright
Removed debugging code
parent
36762962
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
336 additions
and
2182 deletions
+336
-2182
platforms/cuda/tests/TestCudaGBVIForce2.cpp
platforms/cuda/tests/TestCudaGBVIForce2.cpp
+33
-4
platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
+33
-4
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+27
-108
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
...orms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+1
-6
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
...cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+26
-145
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
.../cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+9
-384
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+25
-106
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
...tforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
+5
-237
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
...cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+1
-5
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
.../platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+28
-159
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
...ms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+4
-115
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
...latforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+28
-126
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
...platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+1
-9
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+26
-45
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
...orms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+1
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
+0
-12
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
...tforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+25
-40
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
...c/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+30
-312
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
...rc/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+7
-98
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
...uda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+26
-265
No files found.
platforms/cuda/tests/TestCudaGBVIForce2.cpp
View file @
2b508482
...
...
@@ -61,6 +61,17 @@
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "OpenMM.h"
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
#include "ReferencePlatform.h"
#include "OpenCLPlatform.h"
#endif
#if TEST_PLATFORM == TEST_CUDA_PLATFORM
#include "ReferencePlatform.h"
#include "CudaPlatform.h"
#endif
#ifdef USE_SOFTCORE
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
...
...
@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
throw
OpenMMException
(
msg
.
str
()
);
}
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
ReferencePlatform
platform1
;
OpenCLPlatform
platform2
;
#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
ReferencePlatform
platform1
;
CudaPlatform
platform2
;
#else
Platform
&
platform1
=
Platform
::
getPlatformByName
(
platformName1
);
if
(
deviceId1
){
setDeviceId
(
platform1
,
deviceId1
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform1
,
log
);
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Platform
&
platform2
=
Platform
::
getPlatformByName
(
platformName2
);
if
(
deviceId2
){
setDeviceId
(
platform2
,
deviceId2
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform2
,
log
);
#endif
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Context
context2
(
system2
,
integrator2
,
platform2
);
context2
.
setPositions
(
positions
);
State
state2
=
context2
.
getState
(
State
::
Forces
|
State
::
Energy
);
double
energyDiff
=
0.0
;
...
...
@@ -3419,10 +3439,19 @@ int main() {
}
}
// if TEST_PLATFORM is not set, then check that required libs are available
#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
envVariableIsSet
=
checkRequiredLibsAreAvailable
(
requiredLibs
,
loadedLibs
,
log
);
if
(
envVariableIsSet
==
0
&&
log
){
(
void
)
fprintf
(
log
,
"Aborting tests due to missing libs.
\n
"
);
}
#else
// unit test path: force tests to run
envVariableIsSet
=
1
;
#endif
if
(
platformId2s
.
size
()
>
0
){
generativeArgumentMaps
[
"platformId2"
]
=
platformId2s
;
...
...
platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
View file @
2b508482
...
...
@@ -61,6 +61,17 @@
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "OpenMM.h"
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
#include "ReferencePlatform.h"
#include "OpenCLPlatform.h"
#endif
#if TEST_PLATFORM == TEST_CUDA_PLATFORM
#include "ReferencePlatform.h"
#include "CudaPlatform.h"
#endif
#ifdef USE_SOFTCORE
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
...
...
@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
throw
OpenMMException
(
msg
.
str
()
);
}
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
ReferencePlatform
platform1
;
OpenCLPlatform
platform2
;
#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
ReferencePlatform
platform1
;
CudaPlatform
platform2
;
#else
Platform
&
platform1
=
Platform
::
getPlatformByName
(
platformName1
);
if
(
deviceId1
){
setDeviceId
(
platform1
,
deviceId1
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform1
,
log
);
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Platform
&
platform2
=
Platform
::
getPlatformByName
(
platformName2
);
if
(
deviceId2
){
setDeviceId
(
platform2
,
deviceId2
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform2
,
log
);
#endif
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Context
context2
(
system2
,
integrator2
,
platform2
);
context2
.
setPositions
(
positions
);
State
state2
=
context2
.
getState
(
State
::
Forces
|
State
::
Energy
);
double
energyDiff
=
0.0
;
...
...
@@ -3419,10 +3439,19 @@ int main() {
}
}
// if TEST_PLATFORM is not set, then check that required libs are available
#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
envVariableIsSet
=
checkRequiredLibsAreAvailable
(
requiredLibs
,
loadedLibs
,
log
);
if
(
envVariableIsSet
==
0
&&
log
){
(
void
)
fprintf
(
log
,
"Aborting tests due to missing libs.
\n
"
);
}
#else
// unit test path: force tests to run
envVariableIsSet
=
1
;
#endif
if
(
platformId2s
.
size
()
>
0
){
generativeArgumentMaps
[
"platformId2"
]
=
platformId2s
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -591,8 +611,7 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato
// reduce psWorkArray_3_1 -> torque
static
void
kReduceTorque
(
amoebaGpuContext
amoebaGpu
)
{
static
void
kReduceTorque
(
amoebaGpuContext
amoebaGpu
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
...
...
@@ -609,42 +628,14 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
--------------------------------------------------------------------------------------- */
void
cudaComputeAmoebaElectrostatic
(
amoebaGpuContext
amoebaGpu
,
int
addTorqueToForce
)
{
void
cudaComputeAmoebaElectrostatic
(
amoebaGpuContext
amoebaGpu
,
int
addTorqueToForce
){
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaElectrostatic"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
}
static
const
int
maxSlots
=
20
;
int
paddedNumberOfAtoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
maxSlots
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
maxSlots
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
//unsigned int targetAtom = 1137;
unsigned
int
targetAtom
=
1
;
#endif
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -662,20 +653,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
kClearFields_3
(
amoebaGpu
,
1
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
ElectrostaticParticle
),
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
}
else
{
kCalculateAmoebaCudaElectrostaticN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
}
#else
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
...
...
@@ -683,66 +660,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
kCalculateAmoebaCudaElectrostaticN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
}
#endif
LAUNCHERROR
(
"kCalculateAmoebaCudaElectrostaticN2Forces"
);
#ifdef AMOEBA_DEBUG
if
(
0
){
debugArray
->
Download
();
std
::
vector
<
double
>
conversions
;
conversions
.
push_back
(
0.1
f
/
4.184
f
);
conversions
.
push_back
(
0.1
f
/
4.184
f
);
unsigned
int
kkBlocks
=
4
;
(
void
)
fprintf
(
stderr
,
"
\n
Target atom output %5u
\n
"
,
targetAtom
);
for
(
unsigned
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
double
sum
=
0.0
;
for
(
unsigned
int
kk
=
0
;
kk
<
kkBlocks
&&
sum
==
0.0
;
kk
++
){
unsigned
int
index
=
ii
+
kk
*
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
sum
+=
debugArray
->
_pSysData
[
index
].
x
+
debugArray
->
_pSysData
[
index
].
y
+
debugArray
->
_pSysData
[
index
].
z
+
debugArray
->
_pSysData
[
index
].
w
;
}
if
(
sum
>
0.0
){
(
void
)
fprintf
(
stderr
,
"%5u"
,
ii
);
for
(
unsigned
int
kk
=
0
;
kk
<
kkBlocks
;
kk
++
){
unsigned
int
index
=
ii
+
kk
*
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
(
void
)
fprintf
(
stderr
,
" %15.7e %15.7e %15.7e %5.1f"
,
conversions
[
kk
]
*
debugArray
->
_pSysData
[
index
].
x
,
conversions
[
kk
]
*
debugArray
->
_pSysData
[
index
].
y
,
conversions
[
kk
]
*
debugArray
->
_pSysData
[
index
].
z
,
debugArray
->
_pSysData
[
index
].
w
);
if
(
((
kk
+
1
)
%
2
)
==
0
&&
(
kk
!=
(
kkBlocks
-
1
)
)
){
(
void
)
fprintf
(
stderr
,
"
\n
%5u"
,
ii
);
}
}
(
void
)
fprintf
(
stderr
,
"
\n
"
);
if
(
kkBlocks
>
2
){
(
void
)
fprintf
(
stderr
,
"
\n
"
);
}
}
}
}
#endif
#ifdef AMOEBA_DEBUG
if
(
0
){
VectorOfDoubleVectors
outputVector
;
std
::
vector
<
int
>
fileId
;
static
int
call
=
0
;
fileId
.
push_back
(
call
++
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float
>*
temp
=
new
CUDAStream
<
float
>
(
3
*
paddedNumberOfAtoms
,
1
,
"ElectrostaticTemp"
);
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4
(
gpu
->
psForce4
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
reduceAndCopyCUDAStreamFloat
(
amoebaGpu
->
psWorkArray_3_1
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaElectrostaticTorque"
,
fileId
,
outputVector
);
delete
temp
;
}
#endif
if
(
addTorqueToForce
){
kReduceTorque
(
amoebaGpu
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
View file @
2b508482
...
...
@@ -35,12 +35,7 @@ __launch_bounds__(128, 1)
__launch_bounds__
(
64
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaCudaElectrostatic
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputTorque
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
unsigned
int
*
workUnit
,
float
*
outputTorque
){
extern
__shared__
volatile
ElectrostaticParticle
sA
[];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -31,6 +49,7 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
status
=
cudaMemcpyFromSymbol
(
&
amoebaGpu
->
amoebaSim
,
cAmoebaSim
,
sizeof
(
cudaAmoebaGmxSimulation
));
RTERROR
(
status
,
"GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"
);
}
// reduce psWorkArray_3_1 -> E_Field
// reduce psWorkArray_3_2 -> E_FieldPolar
// reduce psWorkArray_3_3 -> Gk_FieldPolar
...
...
@@ -66,10 +85,6 @@ __device__ void calculateFixedGkFieldPairIxn_kernel( float4 atomCoordinatesI,
float
*
labFrameQuadrupoleI
,
float
*
labFrameQuadrupoleJ
,
float
rb2
,
float
outputField
[
2
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
){
float
xi
,
yi
,
zi
;
...
...
@@ -325,32 +340,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"computeCudaAmoebaFixedEAndGKFields"
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
// N2 debug array
int
maxSlots
=
10
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
maxSlots
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
maxSlots
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
// print intermediate results for the targetAtom
unsigned
int
targetAtom
=
0
;
#endif
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -367,133 +360,21 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
kClearFields_3
(
amoebaGpu
,
3
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaFixedEAndGkFieldN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaFixedEAndGkFieldN2_kernel"
);
kReduceEAndGkFields
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
amoebaGpu
->
psWorkArray_3_3
->
Download
();
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
amoebaGpu
->
psGk_Field
->
Download
();
gpu
->
psBornRadii
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"OutE & Gk Fields
\n
"
);
int
maxPrint
=
32
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// E_Field
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
2
]
);
// E_Field polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Epol[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
2
]
);
// Gk_Field polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Gk[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psGk_Field
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"EFields End
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"DebugQ
\n
"
);
debugArray
->
Download
();
if
(
0
){
int
ii
=
targetAtom
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e "
,
ii
,
jj
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
,
gpu
->
psBornRadii
->
_pSysData
[
jj
]
);
for
(
int
kk
=
0
;
kk
<
2
;
kk
++
){
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e] "
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
// write results to file
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psGk_Field
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psPosq4
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEAndGkField"
,
fileId
,
outputVector
);
}
delete
debugArray
;
}
#endif
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
View file @
2b508482
...
...
@@ -38,15 +38,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
unsigned
int
*
workUnit
,
float
*
outputEField
,
float
*
outputEFieldPolar
,
float
*
outputGkField
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4
pullBack
[
12
];
#endif
float
*
outputGkField
){
extern
__shared__
FixedFieldParticle
sA
[];
...
...
@@ -127,11 +119,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
loadFixedFieldParticleData
(
&
(
psA
[
j
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
unsigned
int
match
=
(
atomI
==
(
y
+
j
))
?
1
:
0
;
...
...
@@ -150,11 +138,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
gkFieldSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
0
];
gkFieldSum
[
1
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
1
];
...
...
@@ -179,11 +163,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
loadFixedFieldParticleData
(
&
(
psA
[
j
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
float
dScaleVal
;
...
...
@@ -206,85 +186,12 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
eFieldPolarSum
[
1
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
1
];
eFieldPolarSum
[
2
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
// GK field
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
match
=
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
tj
)
>=
cSim
.
atoms
)
?
1
:
0
;
gkFieldSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
0
];
...
...
@@ -292,45 +199,6 @@ if( 0 && atomI == targetAtom ){
gkFieldSum
[
2
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
///unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += match ? 0.0f : cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexJ][0];
debugArray[index].y = match ? 0.0f : ijField[indexJ][1];
debugArray[index].z = match ? 0.0f : ijField[indexJ][2];
}
#endif
}
}
...
...
@@ -377,11 +245,7 @@ if( atomI == targetAtom ){
loadFixedFieldParticleData
(
&
(
psA
[
tj
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -403,84 +267,12 @@ if( atomI == targetAtom ){
psA
[
tj
].
eFieldP
[
1
]
+=
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
ijField
[
1
][
2
];
// #ifdef AMOEBA_DEBUG
#if 0
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
//unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
#if 0
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
#endif
}
#endif
// Gk field
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
gkFieldSum
[
0
]
+=
ijField
[
0
][
0
];
gkFieldSum
[
1
]
+=
ijField
[
0
][
1
];
...
...
@@ -491,41 +283,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
gkField
[
2
]
+=
ijField
[
1
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
//unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
...
...
@@ -548,11 +305,7 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
loadFixedFieldParticleData
(
&
(
psA
[
tj
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
float
dScaleVal
;
float
pScaleVal
;
...
...
@@ -579,85 +332,12 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
eFieldP
[
1
]
+=
pScaleVal
*
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
pScaleVal
*
ijField
[
1
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
// unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
// unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
// pullBackIndex += 2;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dScaleVal*ijField[indexI][0];
debugArray[index].y = dScaleVal*ijField[indexI][1];
debugArray[index].z = dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pScaleVal*ijField[indexI][0];
debugArray[index].y = pScaleVal*ijField[indexI][1];
debugArray[index].z = pScaleVal*ijField[indexI][2];
*/
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
// GK field
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
gkFieldSum
[
0
]
+=
ijField
[
0
][
0
];
...
...
@@ -668,61 +348,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
gkField
[
1
]
+=
ijField
[
1
][
1
];
psA
[
tj
].
gkField
[
2
]
+=
ijField
[
1
][
2
];
}
//#ifdef AMOEBA_DEBUG
#if 0
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
// pullBackIndex += 2;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][0] : 0.0f;
debugArray[index].y = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][1] : 0.0f;
debugArray[index].z = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][0] : 0.0f;
debugArray[index].y = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][1] : 0.0f;
debugArray[index].z = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][2] : 0.0f;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -76,24 +95,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"computeCudaAmoebaFixedEField"
;
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
// N2 debug array
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
10
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
// print intermediate results for the targetAtom
unsigned
int
targetAtom
=
3
;
#endif
kClearFields_3
(
amoebaGpu
,
2
);
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -108,15 +109,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaFixedEField numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
...
...
@@ -131,78 +123,5 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR
(
"kCalculateAmoebaFixedE_FieldN2Forces_kernel"
);
#if 0
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReduceE_Fields_kernel
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"OutEFields
\n
"
);
int
maxPrint
=
32
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// E_Field
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
2
]
);
// E_Field polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Epol[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"EFields End
\n
"
);
// write results to file
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEField"
,
fileId
,
outputVector
);
}
//delete debugArray;
}
#endif
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
View file @
2b508482
...
...
@@ -37,15 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
void
METHOD_NAME
(
kCalculateAmoebaFixedE_Field
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputEField
,
float
*
outputEFieldPolar
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4
pullBack
[
12
];
#endif
float
*
outputEFieldPolar
){
extern
__shared__
FixedFieldParticle
sA
[];
...
...
@@ -106,11 +98,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
float
ijField
[
2
][
3
];
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
unsigned
int
match
=
(
atomI
==
(
y
+
j
))
?
1
:
0
;
...
...
@@ -142,11 +130,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
//loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
float
dScaleVal
;
float
pScaleVal
;
...
...
@@ -168,73 +152,6 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
fieldPolarSum
[
1
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
1
];
fieldPolarSum
[
2
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
2
];
#ifdef AMOEBA_DEBUG
if
(
0
&&
atomI
==
targetAtom
){
unsigned
int
index
=
atomI
==
targetAtom
?
(
y
+
j
)
:
atomI
;
// unsigned int pullBackIndex = 0;
unsigned
int
indexI
=
0
;
unsigned
int
indexJ
=
indexI
?
0
:
1
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
dScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
dScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
dScaleVal
*
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
indexI
][
2
];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
}
}
...
...
@@ -274,11 +191,7 @@ if( 0 && atomI == targetAtom ){
float
ijField
[
2
][
3
];
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -300,74 +213,6 @@ if( 0 && atomI == targetAtom ){
psA
[
tj
].
eFieldP
[
1
]
+=
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
ijField
[
1
][
2
];
#ifdef AMOEBA_DEBUG
if
(
0
&&
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
// unsigned int pullBackIndex = 0;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
#if 0
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
#endif
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
...
...
@@ -388,11 +233,7 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
float
ijField
[
2
][
3
];
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
float
dScaleVal
;
float
pScaleVal
;
...
...
@@ -419,79 +260,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
eFieldP
[
1
]
+=
pScaleVal
*
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
pScaleVal
*
ijField
[
1
][
2
];
#ifdef AMOEBA_DEBUG
if
(
0
&&
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
// unsigned int pullBackIndex = 0;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
dScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
dScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
dScaleVal
*
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
pScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
pScaleVal
*
ijField
[
indexI
][
2
];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
View file @
2b508482
...
...
@@ -151,11 +151,7 @@ __device__ static void zeroFixedFieldParticleSharedField( struct FixedFieldParti
// body of fixed E-field calculation
__device__
static
void
calculateFixedEFieldPairIxn_kernel
(
FixedFieldParticle
&
atomI
,
FixedFieldParticle
&
atomJ
,
float
field
[
2
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
debugArray
[
12
]
#endif
)
float
field
[
2
][
3
])
{
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -204,44 +224,12 @@ void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
void
kCalculateAmoebaGrycukBornRadii
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateAmoebaGrycukBornRadii"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
amoebaGpu
->
scalingDistanceCutoff
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
unsigned
int
targetAtom
=
0
;
gpu
->
psBornRadii
->
Download
();
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Grycuk input
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born %6d %16.9e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -255,28 +243,7 @@ void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
GrycukParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
maxThreads
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukParticle
),
sizeof
(
GrycukParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
(
gpu
->
bOutputBufferPerWarp
?
" "
:
" no "
),
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukParticle
),
sizeof
(
GrycukParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
);
...
...
@@ -330,13 +297,7 @@ __device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRulePa
}
//#define AMOEBA_DEBUG
__device__
void
calculateGrycukChainRulePairIxn_kernel
(
GrycukChainRuleParticle
&
atomI
,
GrycukChainRuleParticle
&
atomJ
,
float
force
[
3
]
#ifdef AMOEBA_DEBUG
,
float4
pullDebug
[
5
]
#endif
){
__device__
void
calculateGrycukChainRulePairIxn_kernel
(
GrycukChainRuleParticle
&
atomI
,
GrycukChainRuleParticle
&
atomJ
,
float
force
[
3
]
){
const
float
pi
=
3.1415926535897
f
;
float
third
=
1.0
f
/
3.0
f
;
...
...
@@ -390,18 +351,6 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
float
dbr
=
term
*
de
/
r
;
de
=
dbr
*
atomI
.
bornForce
;
#ifdef AMOEBA_DEBUG
pullDebug
[
0
].
x
=
de
;
pullDebug
[
0
].
y
=
r
;
pullDebug
[
0
].
z
=
factor
;
pullDebug
[
0
].
w
=
-
4.0
f
;
pullDebug
[
1
].
x
=
atomI
.
bornForce
/
4.184
f
;
pullDebug
[
1
].
y
=
atomI
.
bornRadius
;
pullDebug
[
1
].
z
=
atomJ
.
bornForce
/
4.184
f
;
pullDebug
[
1
].
w
=
-
5.0
f
;
#endif
force
[
0
]
=
xr
*
de
;
force
[
1
]
=
yr
*
de
;
force
[
2
]
=
zr
*
de
;
...
...
@@ -430,43 +379,12 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
void
kCalculateGrycukGbsaForces2
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateGrycukGbsaForces2"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
20
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
20
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
unsigned
int
targetAtom
=
0
;
gpu
->
psBornRadii
->
Download
();
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Grycuk input
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born %6d %16.9e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -480,64 +398,15 @@ void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
GrycukChainRuleParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
maxThreads
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukChainRuleParticle
),
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
(
gpu
->
bOutputBufferPerWarp
?
" "
:
" no "
),
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukChainRuleParticle
),
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
//kClearForces( gpu );
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
#endif
);
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
);
}
else
{
kCalculateAmoebaGrycukChainRuleN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
#endif
);
kCalculateAmoebaGrycukChainRuleN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
);
}
LAUNCHERROR
(
"kCalculateAmoebaCudaGrycukN2Forces"
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
debugArray
->
Download
();
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d %5d DebugGrycukChain
\n
"
,
targetAtom
,
jj
);
for
(
int
kk
=
0
;
kk
<
7
;
kk
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e %16.9e]
\n
"
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
debugIndex
+=
paddedNumberOfAtoms
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
#endif
if
(
0
){
static
int
callId
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
View file @
2b508482
...
...
@@ -34,11 +34,7 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__
(
G8X_NONBOND_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaGrycukChainRule
,
_kernel
)(
unsigned
int
*
workUnit
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
void
METHOD_NAME
(
kCalculateAmoebaGrycukChainRule
,
_kernel
)(
unsigned
int
*
workUnit
){
extern
__shared__
GrycukChainRuleParticle
sAChainRule
[];
...
...
@@ -49,10 +45,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
unsigned
int
lasty
=
0xFFFFFFFF
;
#ifdef AMOEBA_DEBUG
float4
pullDebug
[
5
];
#endif
while
(
pos
<
end
)
{
...
...
@@ -85,11 +77,7 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
for
(
unsigned
int
j
=
(
tgx
+
1
)
&
(
GRID
-
1
);
j
!=
tgx
;
j
=
(
j
+
1
)
&
(
GRID
-
1
))
{
float
localForce
[
3
];
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
j
],
localForce
#ifdef AMOEBA_DEBUG
,
pullDebug
#endif
);
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
j
],
localForce
);
if
(
(
atomI
!=
(
y
+
j
))
&&
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
j
)
<
cSim
.
atoms
)
){
localParticle
.
force
[
0
]
-=
localForce
[
0
];
...
...
@@ -100,54 +88,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
psAChainRule
[
j
].
force
[
1
]
+=
localForce
[
1
];
psAChainRule
[
j
].
force
[
2
]
+=
localForce
[
2
];
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
(
y
+
j
)
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
j
)
:
atomI
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
debugArray
[
index
].
z
=
-
1
.
0
f
;
debugArray
[
index
].
w
=
-
1
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
tgx
;
debugArray
[
index
].
w
=
-
2
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
1
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
1
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
1
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
1
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
localForce
[
0
];
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
12
.
0
f
;
calculateGrycukChainRulePairIxn_kernel
(
psAChainRule
[
j
],
localParticle
,
localForce
,
pullDebug
);
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
-
13
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
localForce
[
0
];
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
14
.
0
f
;
}
#endif
}
}
...
...
@@ -181,11 +121,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
float
localForce
[
3
];
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
tj
],
localForce
#ifdef AMOEBA_DEBUG
,
pullDebug
#endif
);
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
tj
],
localForce
);
localParticle
.
force
[
0
]
-=
localForce
[
0
];
localParticle
.
force
[
1
]
-=
localForce
[
1
];
...
...
@@ -195,54 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
psAChainRule
[
tj
].
force
[
1
]
+=
localForce
[
1
];
psAChainRule
[
tj
].
force
[
2
]
+=
localForce
[
2
];
#ifdef AMOEBA_DEBUG
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
if
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
){
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
-
1
.
0
f
;
debugArray
[
index
].
w
=
-
1
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
tgx
;
debugArray
[
index
].
w
=
-
2
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
1
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
1
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
1
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
1
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
localForce
[
0
];
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
10
.
0
f
;
}
#endif
calculateGrycukChainRulePairIxn_kernel
(
psAChainRule
[
tj
],
localParticle
,
localForce
#ifdef AMOEBA_DEBUG
,
pullDebug
#endif
);
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
){
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
11
.
0
f
;
}
#endif
calculateGrycukChainRulePairIxn_kernel
(
psAChainRule
[
tj
],
localParticle
,
localForce
);
localParticle
.
force
[
0
]
+=
localForce
[
0
];
localParticle
.
force
[
1
]
+=
localForce
[
1
];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
//#include "amoebaGpuTypes.h"
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define INCLUDE_TORQUE
#include "kCalculateAmoebaCudaKirkwoodParticle.h"
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -1638,26 +1657,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
// reduce psWorkArray_1_1 -> dBorn
// reduce psWorkArray_1_2 -> dBornPolar
#ifdef AMOEBA_DEBUG
static
void
kReduce_dBorn
(
amoebaGpuContext
amoebaGpu
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
/*
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn1");
*/
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_1_2
->
_pDevData
,
amoebaGpu
->
psBornPolar
->
_pDevData
,
0
);
LAUNCHERROR
(
"kReduce_dBorn2"
);
}
#endif
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
...
...
@@ -1940,40 +1939,6 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
amoebaGpu
->
gpuContext
->
psBornForce
->
_pDevData
);
}
LAUNCHERROR
(
"kReduceToBornForcePrefactor"
);
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
// kClearEnergy() should be called prior to kReduceToBornForcePrefactorAndSASA_kernel
double
energy
=
kReduceEnergy
(
amoebaGpu
->
gpuContext
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born force w/ cavity energy=%15.7e.
\n
"
,
energy
);
(
void
)
fflush
(
amoebaGpu
->
log
);
/*
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
(void) fprintf( amoebaGpu->log,"bF %16.9e obc=%16.9e bR=%16.9e\n",
amoebaGpu->gpuContext->psBornForce->_pSysData[ii],
amoebaGpu->gpuContext->psObcData->_pSysData[ii].x,
amoebaGpu->gpuContext->psBornRadii->_pSysData[ii] );
}
(void) fflush( amoebaGpu->log );
*/
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloat4Array
(
amoebaGpu
->
gpuContext
->
natoms
,
3
,
amoebaGpu
->
gpuContext
->
psPosq4
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
amoebaGpu
->
gpuContext
->
natoms
,
1
,
amoebaGpu
->
gpuContext
->
psBornRadii
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloat2Array
(
amoebaGpu
->
gpuContext
->
natoms
,
2
,
amoebaGpu
->
gpuContext
->
psObcData
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
amoebaGpu
->
gpuContext
->
natoms
,
1
,
amoebaGpu
->
gpuContext
->
psBornForce
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaBornForce"
,
fileId
,
outputVector
);
}
}
#endif
#undef AMOEBA_DEBUG
}
/**---------------------------------------------------------------------------------------
...
...
@@ -1988,44 +1953,11 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
void
kCalculateAmoebaKirkwood
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateAmoebaKirkwood"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
amoebaGpu
->
scalingDistanceCutoff
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
unsigned
int
targetAtom
=
0
;
gpu
->
psBornRadii
->
Download
();
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Kirkwood input
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born %6d %16.9e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -2040,47 +1972,17 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaKirkwood: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
maxThreads
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
KirkwoodParticle
),
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
}
kClearFields_1
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaKirkwoodN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
(
gpu
->
bOutputBufferPerWarp
?
" "
:
" no "
),
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
KirkwoodParticle
),
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
);
#else
);
#endif
gpu
->
psWorkUnit
->
_pDevData
);
}
else
{
kCalculateAmoebaCudaKirkwoodN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
);
#else
);
#endif
gpu
->
psWorkUnit
->
_pDevData
);
}
LAUNCHERROR
(
"kCalculateAmoebaCudaKirkwoodN2Forces"
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
View file @
2b508482
...
...
@@ -35,15 +35,7 @@ __launch_bounds__(128, 1)
__launch_bounds__
(
64
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaCudaKirkwood
,
Forces_kernel
)(
unsigned
int
*
workUnit
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4
pullBack
[
20
];
#endif
unsigned
int
*
workUnit
){
extern
__shared__
KirkwoodParticle
sA
[];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#include "kCalculateAmoebaCudaKirkwoodParticle.h"
#include "kCalculateAmoebaCudaKirkwoodEDiffParticle.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -865,19 +884,6 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
#endif
#ifdef AMOEBA_DEBUG
__device__
static
int
debugAccumulate
(
unsigned
int
index
,
float4
*
debugArray
,
float
*
field
,
unsigned
int
addMask
,
float
idLabel
)
{
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
addMask
?
field
[
0
]
:
0.0
f
;
debugArray
[
index
].
y
=
addMask
?
field
[
1
]
:
0.0
f
;
debugArray
[
index
].
z
=
addMask
?
field
[
2
]
:
0.0
f
;
debugArray
[
index
].
w
=
idLabel
;
return
index
;
}
#endif
__device__
void
zeroKirkwoodEDiffParticleSharedField
(
struct
KirkwoodEDiffParticle
*
sA
)
{
// zero shared fields
...
...
@@ -931,22 +937,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
static
int
timestep
=
0
;
timestep
++
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateAmoebaKirkwoodEDiff"
;
std
::
vector
<
int
>
fileId
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
//
static
unsigned
int
threadsPerBlock
=
0
;
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
...
...
@@ -960,23 +954,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodEDiffParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u step=%d
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
KirkwoodEDiffParticle
),
sizeof
(
KirkwoodEDiffParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sm_version
,
gpu
->
device
,
gpu
->
sharedMemoryPerBlock
,
timestep
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodEDiffParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
}
else
{
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodEDiffParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
View file @
2b508482
...
...
@@ -35,8 +35,7 @@ __launch_bounds__(96, 1)
__launch_bounds__
(
32
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaCudaKirkwoodEDiff
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputTorque
){
unsigned
int
*
workUnit
,
float
*
outputTorque
){
extern
__shared__
KirkwoodEDiffParticle
sA
[];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
View file @
2b508482
...
...
@@ -1616,19 +1616,7 @@ void kCalculateAmoebaLocalForces_kernel()
void
kCalculateAmoebaLocalForces
(
amoebaGpuContext
gpu
)
{
#ifdef AMOEBA_DEBUG
if
(
gpu
->
log
){
static
int
call
=
1
;
if
(
call
==
0
){
(
void
)
fprintf
(
gpu
->
log
,
"kCalculateAmoebaLocalForces: blks=%u thrds/blk=%u
\n
"
,
gpu
->
gpuContext
->
sim
.
blocks
,
gpu
->
gpuContext
->
sim
.
localForces_threads_per_block
);
fflush
(
gpu
->
log
);
call
++
;
}
}
#endif
kCalculateAmoebaLocalForces_kernel
<<<
gpu
->
gpuContext
->
sim
.
blocks
,
gpu
->
gpuContext
->
sim
.
localForces_threads_per_block
>>>
();
LAUNCHERROR
(
"kCalculateAmoebaLocalForces"
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
View file @
2b508482
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#define BLOCK_SIZE 128
using
namespace
std
;
static
__constant__
cudaGmxSimulation
cSim
;
...
...
@@ -469,41 +491,4 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
amoebaAddMapTorqueForceToForce_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
threads_per_block
>>>
(
);
LAUNCHERROR
(
"amoebaAddMapTorqueForceToForce"
);
}
#ifdef AMOEBA_DEBUG
if
(
0
){
VectorOfDoubleVectors
outputVector
;
std
::
vector
<
int
>
fileId
;
static
int
call
=
0
;
fileId
.
push_back
(
call
++
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float
>*
temp
=
new
CUDAStream
<
float
>
(
3
*
paddedNumberOfAtoms
,
1
,
"TempMapTorqueAndAddToForce2"
);
reduceAndCopyCUDAStreamFloat4
(
gpu
->
psForce4
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
reduceAndCopyCUDAStreamFloat4
(
amoebaGpu
->
psTorqueMapForce4
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
for
(
int
pId
=
0
;
pId
<
5
;
pId
++
){
float
sum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
stderr
,
"
\n\n
TorqueForceToForce for part=%d
\n
"
,
pId
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
amoebaSim
.
maxTorqueBufferIndex
;
ii
++
){
(
void
)
fprintf
(
stderr
,
"%4d [%15.7e %15.7e %15.7e]
\n
"
,
ii
,
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
x
,
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
y
,
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
z
);
sum
[
0
]
+=
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
x
;
sum
[
1
]
+=
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
y
;
sum
[
2
]
+=
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
z
;
}
(
void
)
fprintf
(
stderr
,
"TorqueForceToForce for partcle=%d [%15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e]
\n
"
,
pId
,
sum
[
0
],
sum
[
1
],
sum
[
2
],
sum
[
0
]
/
4.184
f
,
sum
[
1
]
/
4.184
f
,
sum
[
2
]
/
4.184
f
);
}
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaElectrostatiPostTorqueForce"
,
fileId
,
outputVector
);
delete
temp
;
}
#endif
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
...
...
@@ -33,21 +55,12 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
RTERROR
(
status
,
"GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"
);
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef GK
__device__
void
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
,
float
fields
[
8
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
)
float
fields
[
8
][
3
]
)
{
float
deltaR
[
3
];
...
...
@@ -119,13 +132,7 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedPa
}
__device__
void
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
,
float
gkField
[
8
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
)
float
gkField
[
8
][
3
]
)
{
float
gux
[
5
];
...
...
@@ -213,20 +220,6 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced
}
#ifdef AMOEBA_DEBUG
__device__
static
int
debugAccumulate
(
int
index
,
float4
*
debugArray
,
float
*
field
,
unsigned
int
addMask
,
float
idLabel
)
{
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
addMask
?
field
[
0
]
:
0.0
f
;
debugArray
[
index
].
y
=
addMask
?
field
[
1
]
:
0.0
f
;
debugArray
[
index
].
z
=
addMask
?
field
[
2
]
:
0.0
f
;
debugArray
[
index
].
w
=
idLabel
;
return
index
;
}
#endif
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
...
...
@@ -324,12 +317,6 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
epsilon
[
0
]
=
epsilon
[
0
]
<
delta
[
0
].
z
?
delta
[
0
].
z
:
epsilon
[
0
];
epsilon
[
0
]
=
epsilon
[
0
]
<
delta
[
0
].
w
?
delta
[
0
].
w
:
epsilon
[
0
];
epsilon
[
0
]
=
48.033324
f
*
sqrtf
(
epsilon
[
0
]
/
(
(
float
)
cSim
.
atoms
)
);
#ifdef AMOEBA_DEBUG
epsilon
[
1
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
x
/
(
(
float
)
cSim
.
atoms
)
);
epsilon
[
2
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
y
/
(
(
float
)
cSim
.
atoms
)
);
epsilon
[
3
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
z
/
(
(
float
)
cSim
.
atoms
)
);
epsilon
[
4
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
w
/
(
(
float
)
cSim
.
atoms
)
);
#endif
}
}
...
...
@@ -460,21 +447,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
static
int
iteration
=
1
;
int
targetAtom
=
0
;
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply"
;
if
(
1
&&
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
#endif
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// clear output arrays
...
...
@@ -493,28 +466,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
...
...
@@ -522,120 +480,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaMutualInducedAndGkFields"
);
kReduceMutualInducedAndGkFields
(
amoebaGpu
,
outputArray
,
outputPolarArray
,
outputArrayS
,
outputPolarArrayS
);
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
amoebaGpu
->
psWorkArray_3_3
->
Download
();
amoebaGpu
->
psWorkArray_3_4
->
Download
();
if
(
amoebaGpu
->
log
&&
iteration
==
1
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished MI kernel execution %d
\n
"
,
iteration
);
(
void
)
fflush
(
amoebaGpu
->
log
);
outputArray
->
Download
();
outputPolarArray
->
Download
();
outputArrayS
->
Download
();
outputPolarArrayS
->
Download
();
debugArray
->
Download
();
int
maxPrint
=
33
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// Mi
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Mult[%16.9e %16.9e %16.9e] "
,
outputArray
->
_pSysData
[
indexOffset
],
outputArray
->
_pSysData
[
indexOffset
+
1
],
outputArray
->
_pSysData
[
indexOffset
+
2
]
);
// Mi polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
" MultP[%16.9e %16.9e %16.9e]
\n
"
,
outputPolarArray
->
_pSysData
[
indexOffset
],
outputPolarArray
->
_pSysData
[
indexOffset
+
1
],
outputPolarArray
->
_pSysData
[
indexOffset
+
2
]
);
// MiS
(
void
)
fprintf
(
amoebaGpu
->
log
,
" MultS[%16.9e %16.9e %16.9e] "
,
outputArrayS
->
_pSysData
[
indexOffset
],
outputArrayS
->
_pSysData
[
indexOffset
+
1
],
outputArrayS
->
_pSysData
[
indexOffset
+
2
]
);
// Mi polarS
(
void
)
fprintf
(
amoebaGpu
->
log
,
"MultPS[%16.9e %16.9e %16.9e]
\n
"
,
outputPolarArrayS
->
_pSysData
[
indexOffset
],
outputPolarArrayS
->
_pSysData
[
indexOffset
+
1
],
outputPolarArrayS
->
_pSysData
[
indexOffset
+
2
]
);
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
if
(
ii
==
targetAtom
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d %4d Rint [%16.9e %16.9e %16.9e %16.9e]
\n
"
,
ii
,
jj
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
for
(
int
kk
=
0
;
kk
<
9
;
kk
++
){
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e %5.1f]
\n
"
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
}
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
iteration
++
;
}
delete
debugArray
;
#endif
}
/**---------------------------------------------------------------------------------------
...
...
@@ -649,22 +500,12 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
static
void
cudaComputeAmoebaMutualInducedAndGkFieldBySOR
(
amoebaGpuContext
amoebaGpu
)
{
// ---------------------------------------------------------------------------------------
static
int
timestep
=
0
;
timestep
++
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedAndGkFieldBySOR"
;
std
::
vector
<
int
>
fileId
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
int
done
;
int
iteration
;
static
int
timestep
=
0
;
timestep
++
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
...
...
@@ -685,49 +526,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
cudaMemcpy
(
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
cudaMemcpy
(
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaMutualInducedAndGkFieldBySOR Initial setup for matrix multiply
\n
"
);
fflush
(
amoebaGpu
->
log
);
gpu
->
psPosq4
->
Download
();
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
amoebaGpu
->
psInducedDipole
->
Download
(),
amoebaGpu
->
psInducedDipolePolar
->
Download
();
amoebaGpu
->
psInducedDipoleS
->
Download
(),
amoebaGpu
->
psInducedDipolePolarS
->
Download
();
amoebaGpu
->
psPolarizability
->
Download
();
int
offset
=
0
;
int
maxPrint
=
10
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%4d pol=%12.4e
\n
"
,
ii
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
);
if
(
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
]
||
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"PolX!!! %12.4e %12.4e "
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
" E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] Pos[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
],
gpu
->
psPosq4
->
_pSysData
[
ii
].
x
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
y
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
z
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Gk[%14.6e %14.6e %14.6e] MiS[%14.6e %14.6e %14.6e] MipS[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psGk_Field
->
_pSysData
[
offset
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
2
]
);
offset
+=
3
;
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
)
ii
=
(
gpu
->
natoms
-
maxPrint
);
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// if polarization type is direct, set flags signalling done and return
if
(
amoebaGpu
->
amoebaSim
.
polarizationType
)
...
...
@@ -800,91 +598,11 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
done
=
1
;
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaMutualInducedAndGkFieldBySOR step=%d iteration=%3d eps %14.6e done=%d
\n
"
,
timestep
,
iteration
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
done
);
}
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psInducedDipole
->
Download
();
amoebaGpu
->
psInducedDipolePolar
->
Download
();
amoebaGpu
->
psInducedDipoleS
->
Download
();
amoebaGpu
->
psInducedDipolePolarS
->
Download
();
amoebaGpu
->
psWorkVector
[
2
]
->
Download
();
amoebaGpu
->
psWorkVector
[
3
]
->
Download
();
amoebaGpu
->
psGk_Field
->
Download
();
#if 1
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d
\n
"
,
methodName
,
iteration
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
1
],
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
2
],
done
);
#else
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d
\n
"
,
methodName
,
iteration
,
sum1
,
sum2
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
0
],
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
1
],
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
2
],
done
);
#endif
(
void
)
fflush
(
amoebaGpu
->
log
);
int
offset
=
0
;
int
maxPrint
=
20
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d "
,
ii
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Mi[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Mip[%14.6e %14.6e %14.6e]"
,
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
" MiS[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"MipS[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
2
]
);
/*
(void) fprintf( amoebaGpu->log,"Gk [%14.6e %14.6e %14.6e]\n",
amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"W2 [%14.6e %14.6e %14.6e] ",
amoebaGpu->psWorkVector[2]->_pSysData[offset], amoebaGpu->psWorkVector[2]->_pSysData[offset+1], amoebaGpu->psWorkVector[2]->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"W3 [%14.6e %14.6e %14.6e]\n",
amoebaGpu->psWorkVector[3]->_pSysData[offset], amoebaGpu->psWorkVector[3]->_pSysData[offset+1], amoebaGpu->psWorkVector[3]->_pSysData[offset+2] );
*/
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
){
ii
=
(
gpu
->
natoms
-
maxPrint
);
offset
=
3
*
(
ii
+
1
);
}
else
{
offset
+=
3
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
iteration
++
;
}
amoebaGpu
->
mutualInducedDone
=
done
;
amoebaGpu
->
mutualInducedConverged
=
(
!
done
||
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
)
?
0
:
1
;
#ifdef AMOEBA_DEBUG
if
(
0
&&
amoebaGpu
->
log
){
trackMutualInducedIterations
(
amoebaGpu
,
iteration
);
}
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipoleS
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolarS
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"Cuda_GK_MI"
,
fileId
,
outputVector
);
}
#endif
// ---------------------------------------------------------------------------------------
}
void
cudaComputeAmoebaMutualInducedAndGkField
(
amoebaGpuContext
amoebaGpu
)
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
View file @
2b508482
...
...
@@ -39,11 +39,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
float
*
outputField
,
float
*
outputFieldPolar
,
float
*
outputFieldS
,
float
*
outputFieldPolarS
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
float
*
outputFieldPolarS
){
extern
__shared__
MutualInducedParticle
sA
[];
...
...
@@ -112,11 +108,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
// load coords, charge, ...
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
j
)
>=
cSim
.
atoms
)
)
?
0
:
1
;
...
...
@@ -138,27 +130,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
fieldPolarSumS
[
1
]
+=
mask
?
ijField
[
5
][
1
]
:
0
.
0
f
;
fieldPolarSumS
[
2
]
+=
mask
?
ijField
[
5
][
2
]
:
0
.
0
f
;
//#ifdef AMOEBA_DEBUG
#if 0
unsigned int index = y + j;
if( atomI == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
index = debugAccumulate( index, debugArray, ijField[0], mask, 1.0f );
index = debugAccumulate( index, debugArray, ijField[1], mask, 2.0f );
index = debugAccumulate( index, debugArray, ijField[4], mask, 3.0f );
index = debugAccumulate( index, debugArray, ijField[5], mask, 4.0f );
}
#endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
// atomI == atomJ contribution included
...
...
@@ -171,24 +143,6 @@ if( atomI == targetAtom ){
fieldPolarSumS
[
1
]
+=
mask
?
ijField
[
2
][
1
]
:
0
.
0
f
;
fieldPolarSumS
[
2
]
+=
mask
?
ijField
[
2
][
2
]
:
0
.
0
f
;
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom ){
index = debugAccumulate( index, debugArray, ijField[0], mask, 5.0f );
index = debugAccumulate( index, debugArray, ijField[2], mask, 6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = 9.0f;
}
#endif
}
// Write results
...
...
@@ -213,9 +167,8 @@ if( atomI == targetAtom ){
load3dArray
(
offset
,
fieldPolarSumS
,
outputFieldPolarS
);
#endif
}
else
// 100% utilization
{
}
else
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
...
...
@@ -235,11 +188,7 @@ if( atomI == targetAtom ){
// load coords, charge, ...
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
...
...
@@ -289,29 +238,7 @@ if( atomI == targetAtom ){
}
//#ifdef AMOEBA_DEBUG
#if 0
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -1.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+1], maskD, -2.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+4], maskD, -3.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f );
}
#endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
...
...
@@ -337,24 +264,6 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
psA
[
tj
].
fieldPolarS
[
2
]
+=
ijField
[
3
][
2
];
}
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -5.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = -9.0f;
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
...
...
@@ -33,19 +55,10 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
RTERROR
(
status
,
"GetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"
);
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
__device__
void
calculateMutualInducedFieldPairIxn_kernel
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
,
float
fields
[
4
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
)
float
fields
[
4
][
3
]
)
{
float
deltaR
[
3
];
...
...
@@ -252,20 +265,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
int
targetAtom
=
1231
;
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedFieldMatrixMultiply"
;
static
int
iteration
=
1
;
if
(
1
&&
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
#endif
kClearFields_3
(
amoebaGpu
,
2
);
if
(
threadsPerBlock
==
0
){
...
...
@@ -279,169 +278,23 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaMutualInducedFieldN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaMutualInducedField"
);
kReduceMutualInducedFields
(
amoebaGpu
,
outputArray
,
outputPolarArray
);
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
if
(
amoebaGpu
->
log
&&
iteration
==
-
1
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished MI kernel execution %d
\n
"
,
iteration
);
(
void
)
fflush
(
amoebaGpu
->
log
);
outputArray
->
Download
();
outputPolarArray
->
Download
();
debugArray
->
Download
();
int
maxPrint
=
1400
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// MI
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Mult[%16.9e %16.9e %16.9e] "
,
outputArray
->
_pSysData
[
indexOffset
],
outputArray
->
_pSysData
[
indexOffset
+
1
],
outputArray
->
_pSysData
[
indexOffset
+
2
]
);
// MI polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"MultP[%16.9e %16.9e %16.9e] "
,
outputPolarArray
->
_pSysData
[
indexOffset
],
outputPolarArray
->
_pSysData
[
indexOffset
+
1
],
outputPolarArray
->
_pSysData
[
indexOffset
+
2
]
);
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
if
(
ii
==
targetAtom
){
float
sums
[
4
][
3
]
=
{
{
0.0
f
,
0.0
f
,
0.0
f
},
{
0.0
f
,
0.0
f
,
0.0
f
},
{
0.0
f
,
0.0
f
,
0.0
f
},
{
0.0
f
,
0.0
f
,
0.0
f
}
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
unsigned
int
count
=
0
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d %4d Pint [%16.9e %16.9e %16.9e %16.9e] "
,
ii
,
jj
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
//debugIndex += gpu->natoms;
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e] "
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
);
int
index
=
0
;
sums
[
index
][
0
]
+=
debugArray
->
_pSysData
[
debugIndex
].
x
;
sums
[
index
][
1
]
+=
debugArray
->
_pSysData
[
debugIndex
].
y
;
sums
[
index
][
2
]
+=
debugArray
->
_pSysData
[
debugIndex
].
z
;
if
(
count
&&
(
(
count
%
31
)
==
0
)
){
static
float
saveSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Block sum [%16.9e %16.9e %16.9e] "
,
sums
[
index
][
0
]
-
saveSum
[
0
],
sums
[
index
][
1
]
-
saveSum
[
1
],
sums
[
index
][
2
]
-
saveSum
[
2
]
);
saveSum
[
0
]
=
sums
[
index
][
0
];
saveSum
[
1
]
=
sums
[
index
][
1
];
saveSum
[
2
]
=
sums
[
index
][
2
];
}
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e] "
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
);
index
++
;
sums
[
index
][
0
]
+=
debugArray
->
_pSysData
[
debugIndex
].
x
;
sums
[
index
][
1
]
+=
debugArray
->
_pSysData
[
debugIndex
].
y
;
sums
[
index
][
2
]
+=
debugArray
->
_pSysData
[
debugIndex
].
z
;
if
(
count
&&
(
(
count
%
31
)
==
0
)
){
static
float
saveSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Block sumP [%16.9e %16.9e %16.9e] "
,
sums
[
index
][
0
]
-
saveSum
[
0
],
sums
[
index
][
1
]
-
saveSum
[
1
],
sums
[
index
][
2
]
-
saveSum
[
2
]
);
saveSum
[
0
]
=
sums
[
index
][
0
];
saveSum
[
1
]
=
sums
[
index
][
1
];
saveSum
[
2
]
=
sums
[
index
][
2
];
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
count
++
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
index
=
0
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum1 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum2 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum3 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum4 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
iteration
++
;
}
delete
debugArray
;
#endif
}
/**---------------------------------------------------------------------------------------
...
...
@@ -457,18 +310,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedFieldBySOR"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
int
done
;
int
iteration
;
...
...
@@ -489,38 +330,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
cudaMemcpy
(
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
cudaMemcpy
(
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
amoebaGpu
->
psInducedDipole
->
Download
(),
amoebaGpu
->
psInducedDipolePolar
->
Download
();
amoebaGpu
->
psPolarizability
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s Initial setup for matrix multiply
\n
"
,
methodName
);
int
offset
=
0
;
int
maxPrint
=
20000
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d pol=%12.4e "
,
ii
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
);
if
(
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
]
||
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"PolX!!! %12.4e %12.4e "
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
" E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
offset
+=
3
;
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
)
ii
=
(
gpu
->
natoms
-
maxPrint
);
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// if polarization type is direct, set flags signalling done and return
if
(
amoebaGpu
->
amoebaSim
.
polarizationType
)
...
...
@@ -558,12 +367,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
amoebaGpu
->
psCurrentEpsilon
->
_pDevData
);
LAUNCHERROR
(
"kReduceMutualInducedFieldDelta"
);
#ifdef AMOEBA_DEBUG
if
(
0
&&
amoebaGpu
->
log
){
// trackMutualInducedIterations
trackMutualInducedIterations
(
amoebaGpu
,
iteration
);
}
#endif
// Debye=48.033324f
amoebaGpu
->
psCurrentEpsilon
->
Download
();
float
currentEpsilon
=
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
0
];
...
...
@@ -572,54 +375,12 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
if
(
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
||
amoebaGpu
->
mutualInducedCurrentEpsilon
<
amoebaGpu
->
mutualInducedTargetEpsilon
){
done
=
1
;
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psInducedDipole
->
Download
();
amoebaGpu
->
psInducedDipolePolar
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s iteration=%3d eps %14.6e done=%d
\n
"
,
methodName
,
iteration
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
done
);
(
void
)
fflush
(
amoebaGpu
->
log
);
int
offset
=
0
;
int
maxPrint
=
20
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d "
,
ii
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Mi[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Mip[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
){
ii
=
(
gpu
->
natoms
-
maxPrint
);
offset
=
3
*
(
ii
+
1
);
}
else
{
offset
+=
3
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
iteration
++
;
}
amoebaGpu
->
mutualInducedDone
=
done
;
amoebaGpu
->
mutualInducedConverged
=
(
!
done
||
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
)
?
0
:
1
;
#ifdef AMOEBA_DEBUG
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
// cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaMI"
,
fileId
,
outputVector
);
}
#endif
// ---------------------------------------------------------------------------------------
}
void
cudaComputeAmoebaMutualInducedField
(
amoebaGpuContext
amoebaGpu
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment