Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2b508482
"platforms/cuda/vscode:/vscode.git/clone" did not exist on "97ebc35690c706424ae09f67e3d5be6abe7d0d5a"
Commit
2b508482
authored
Nov 23, 2011
by
Mark Friedrichs
Browse files
Added copyright
Removed debugging code
parent
36762962
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
336 additions
and
2182 deletions
+336
-2182
platforms/cuda/tests/TestCudaGBVIForce2.cpp
platforms/cuda/tests/TestCudaGBVIForce2.cpp
+33
-4
platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
+33
-4
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+27
-108
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
...orms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+1
-6
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
...cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+26
-145
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
.../cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+9
-384
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+25
-106
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
...tforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
+5
-237
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
...cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+1
-5
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
.../platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+28
-159
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
...ms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+4
-115
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
...latforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+28
-126
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
...platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+1
-9
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+26
-45
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
...orms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+1
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
+0
-12
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
...tforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+25
-40
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
...c/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+30
-312
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
...rc/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+7
-98
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
...uda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+26
-265
No files found.
platforms/cuda/tests/TestCudaGBVIForce2.cpp
View file @
2b508482
...
...
@@ -61,6 +61,17 @@
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "OpenMM.h"
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
#include "ReferencePlatform.h"
#include "OpenCLPlatform.h"
#endif
#if TEST_PLATFORM == TEST_CUDA_PLATFORM
#include "ReferencePlatform.h"
#include "CudaPlatform.h"
#endif
#ifdef USE_SOFTCORE
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
...
...
@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
throw
OpenMMException
(
msg
.
str
()
);
}
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
ReferencePlatform
platform1
;
OpenCLPlatform
platform2
;
#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
ReferencePlatform
platform1
;
CudaPlatform
platform2
;
#else
Platform
&
platform1
=
Platform
::
getPlatformByName
(
platformName1
);
if
(
deviceId1
){
setDeviceId
(
platform1
,
deviceId1
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform1
,
log
);
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Platform
&
platform2
=
Platform
::
getPlatformByName
(
platformName2
);
if
(
deviceId2
){
setDeviceId
(
platform2
,
deviceId2
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform2
,
log
);
#endif
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Context
context2
(
system2
,
integrator2
,
platform2
);
context2
.
setPositions
(
positions
);
State
state2
=
context2
.
getState
(
State
::
Forces
|
State
::
Energy
);
double
energyDiff
=
0.0
;
...
...
@@ -3419,10 +3439,19 @@ int main() {
}
}
// if TEST_PLATFORM is not set, then check that required libs are available
#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
envVariableIsSet
=
checkRequiredLibsAreAvailable
(
requiredLibs
,
loadedLibs
,
log
);
if
(
envVariableIsSet
==
0
&&
log
){
(
void
)
fprintf
(
log
,
"Aborting tests due to missing libs.
\n
"
);
}
#else
// unit test path: force tests to run
envVariableIsSet
=
1
;
#endif
if
(
platformId2s
.
size
()
>
0
){
generativeArgumentMaps
[
"platformId2"
]
=
platformId2s
;
...
...
platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
View file @
2b508482
...
...
@@ -61,6 +61,17 @@
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "OpenMM.h"
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
#include "ReferencePlatform.h"
#include "OpenCLPlatform.h"
#endif
#if TEST_PLATFORM == TEST_CUDA_PLATFORM
#include "ReferencePlatform.h"
#include "CudaPlatform.h"
#endif
#ifdef USE_SOFTCORE
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
...
...
@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
throw
OpenMMException
(
msg
.
str
()
);
}
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
ReferencePlatform
platform1
;
OpenCLPlatform
platform2
;
#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
ReferencePlatform
platform1
;
CudaPlatform
platform2
;
#else
Platform
&
platform1
=
Platform
::
getPlatformByName
(
platformName1
);
if
(
deviceId1
){
setDeviceId
(
platform1
,
deviceId1
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform1
,
log
);
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Platform
&
platform2
=
Platform
::
getPlatformByName
(
platformName2
);
if
(
deviceId2
){
setDeviceId
(
platform2
,
deviceId2
,
log
);
}
setDeviceIdUsingEnvVariable
(
platform2
,
log
);
#endif
Context
context1
(
system1
,
integrator1
,
platform1
);
context1
.
setPositions
(
positions
);
State
state1
=
context1
.
getState
(
State
::
Forces
|
State
::
Energy
);
Context
context2
(
system2
,
integrator2
,
platform2
);
context2
.
setPositions
(
positions
);
State
state2
=
context2
.
getState
(
State
::
Forces
|
State
::
Energy
);
double
energyDiff
=
0.0
;
...
...
@@ -3419,10 +3439,19 @@ int main() {
}
}
// if TEST_PLATFORM is not set, then check that required libs are available
#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
envVariableIsSet
=
checkRequiredLibsAreAvailable
(
requiredLibs
,
loadedLibs
,
log
);
if
(
envVariableIsSet
==
0
&&
log
){
(
void
)
fprintf
(
log
,
"Aborting tests due to missing libs.
\n
"
);
}
#else
// unit test path: force tests to run
envVariableIsSet
=
1
;
#endif
if
(
platformId2s
.
size
()
>
0
){
generativeArgumentMaps
[
"platformId2"
]
=
platformId2s
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -591,8 +611,7 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato
// reduce psWorkArray_3_1 -> torque
static
void
kReduceTorque
(
amoebaGpuContext
amoebaGpu
)
{
static
void
kReduceTorque
(
amoebaGpuContext
amoebaGpu
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
...
...
@@ -609,42 +628,14 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
--------------------------------------------------------------------------------------- */
void
cudaComputeAmoebaElectrostatic
(
amoebaGpuContext
amoebaGpu
,
int
addTorqueToForce
)
{
void
cudaComputeAmoebaElectrostatic
(
amoebaGpuContext
amoebaGpu
,
int
addTorqueToForce
){
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaElectrostatic"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
}
static
const
int
maxSlots
=
20
;
int
paddedNumberOfAtoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
maxSlots
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
maxSlots
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
//unsigned int targetAtom = 1137;
unsigned
int
targetAtom
=
1
;
#endif
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -662,20 +653,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
kClearFields_3
(
amoebaGpu
,
1
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
ElectrostaticParticle
),
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
}
else
{
kCalculateAmoebaCudaElectrostaticN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
}
#else
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
...
...
@@ -683,66 +660,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
kCalculateAmoebaCudaElectrostaticN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
ElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
}
#endif
LAUNCHERROR
(
"kCalculateAmoebaCudaElectrostaticN2Forces"
);
#ifdef AMOEBA_DEBUG
if
(
0
){
debugArray
->
Download
();
std
::
vector
<
double
>
conversions
;
conversions
.
push_back
(
0.1
f
/
4.184
f
);
conversions
.
push_back
(
0.1
f
/
4.184
f
);
unsigned
int
kkBlocks
=
4
;
(
void
)
fprintf
(
stderr
,
"
\n
Target atom output %5u
\n
"
,
targetAtom
);
for
(
unsigned
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
double
sum
=
0.0
;
for
(
unsigned
int
kk
=
0
;
kk
<
kkBlocks
&&
sum
==
0.0
;
kk
++
){
unsigned
int
index
=
ii
+
kk
*
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
sum
+=
debugArray
->
_pSysData
[
index
].
x
+
debugArray
->
_pSysData
[
index
].
y
+
debugArray
->
_pSysData
[
index
].
z
+
debugArray
->
_pSysData
[
index
].
w
;
}
if
(
sum
>
0.0
){
(
void
)
fprintf
(
stderr
,
"%5u"
,
ii
);
for
(
unsigned
int
kk
=
0
;
kk
<
kkBlocks
;
kk
++
){
unsigned
int
index
=
ii
+
kk
*
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
(
void
)
fprintf
(
stderr
,
" %15.7e %15.7e %15.7e %5.1f"
,
conversions
[
kk
]
*
debugArray
->
_pSysData
[
index
].
x
,
conversions
[
kk
]
*
debugArray
->
_pSysData
[
index
].
y
,
conversions
[
kk
]
*
debugArray
->
_pSysData
[
index
].
z
,
debugArray
->
_pSysData
[
index
].
w
);
if
(
((
kk
+
1
)
%
2
)
==
0
&&
(
kk
!=
(
kkBlocks
-
1
)
)
){
(
void
)
fprintf
(
stderr
,
"
\n
%5u"
,
ii
);
}
}
(
void
)
fprintf
(
stderr
,
"
\n
"
);
if
(
kkBlocks
>
2
){
(
void
)
fprintf
(
stderr
,
"
\n
"
);
}
}
}
}
#endif
#ifdef AMOEBA_DEBUG
if
(
0
){
VectorOfDoubleVectors
outputVector
;
std
::
vector
<
int
>
fileId
;
static
int
call
=
0
;
fileId
.
push_back
(
call
++
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float
>*
temp
=
new
CUDAStream
<
float
>
(
3
*
paddedNumberOfAtoms
,
1
,
"ElectrostaticTemp"
);
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4
(
gpu
->
psForce4
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
reduceAndCopyCUDAStreamFloat
(
amoebaGpu
->
psWorkArray_3_1
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaElectrostaticTorque"
,
fileId
,
outputVector
);
delete
temp
;
}
#endif
if
(
addTorqueToForce
){
kReduceTorque
(
amoebaGpu
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
View file @
2b508482
...
...
@@ -35,12 +35,7 @@ __launch_bounds__(128, 1)
__launch_bounds__
(
64
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaCudaElectrostatic
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputTorque
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
unsigned
int
*
workUnit
,
float
*
outputTorque
){
extern
__shared__
volatile
ElectrostaticParticle
sA
[];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -31,6 +49,7 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
status
=
cudaMemcpyFromSymbol
(
&
amoebaGpu
->
amoebaSim
,
cAmoebaSim
,
sizeof
(
cudaAmoebaGmxSimulation
));
RTERROR
(
status
,
"GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"
);
}
// reduce psWorkArray_3_1 -> E_Field
// reduce psWorkArray_3_2 -> E_FieldPolar
// reduce psWorkArray_3_3 -> Gk_FieldPolar
...
...
@@ -66,10 +85,6 @@ __device__ void calculateFixedGkFieldPairIxn_kernel( float4 atomCoordinatesI,
float
*
labFrameQuadrupoleI
,
float
*
labFrameQuadrupoleJ
,
float
rb2
,
float
outputField
[
2
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
){
float
xi
,
yi
,
zi
;
...
...
@@ -325,32 +340,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"computeCudaAmoebaFixedEAndGKFields"
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
// N2 debug array
int
maxSlots
=
10
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
maxSlots
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
maxSlots
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
// print intermediate results for the targetAtom
unsigned
int
targetAtom
=
0
;
#endif
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -367,133 +360,21 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
kClearFields_3
(
amoebaGpu
,
3
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaFixedEAndGkFieldN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaFixedEAndGkFieldN2_kernel"
);
kReduceEAndGkFields
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
amoebaGpu
->
psWorkArray_3_3
->
Download
();
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
amoebaGpu
->
psGk_Field
->
Download
();
gpu
->
psBornRadii
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"OutE & Gk Fields
\n
"
);
int
maxPrint
=
32
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// E_Field
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
2
]
);
// E_Field polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Epol[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
2
]
);
// Gk_Field polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Gk[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psGk_Field
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"EFields End
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"DebugQ
\n
"
);
debugArray
->
Download
();
if
(
0
){
int
ii
=
targetAtom
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e "
,
ii
,
jj
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
,
gpu
->
psBornRadii
->
_pSysData
[
jj
]
);
for
(
int
kk
=
0
;
kk
<
2
;
kk
++
){
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e] "
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
// write results to file
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psGk_Field
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psPosq4
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEAndGkField"
,
fileId
,
outputVector
);
}
delete
debugArray
;
}
#endif
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
View file @
2b508482
...
...
@@ -38,15 +38,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
unsigned
int
*
workUnit
,
float
*
outputEField
,
float
*
outputEFieldPolar
,
float
*
outputGkField
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4
pullBack
[
12
];
#endif
float
*
outputGkField
){
extern
__shared__
FixedFieldParticle
sA
[];
...
...
@@ -127,11 +119,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
loadFixedFieldParticleData
(
&
(
psA
[
j
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
unsigned
int
match
=
(
atomI
==
(
y
+
j
))
?
1
:
0
;
...
...
@@ -150,11 +138,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
gkFieldSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
0
];
gkFieldSum
[
1
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
1
];
...
...
@@ -179,11 +163,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
loadFixedFieldParticleData
(
&
(
psA
[
j
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
float
dScaleVal
;
...
...
@@ -206,85 +186,12 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
eFieldPolarSum
[
1
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
1
];
eFieldPolarSum
[
2
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
// GK field
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
match
=
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
tj
)
>=
cSim
.
atoms
)
?
1
:
0
;
gkFieldSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
0
];
...
...
@@ -292,45 +199,6 @@ if( 0 && atomI == targetAtom ){
gkFieldSum
[
2
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
///unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += match ? 0.0f : cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexJ][0];
debugArray[index].y = match ? 0.0f : ijField[indexJ][1];
debugArray[index].z = match ? 0.0f : ijField[indexJ][2];
}
#endif
}
}
...
...
@@ -377,11 +245,7 @@ if( atomI == targetAtom ){
loadFixedFieldParticleData
(
&
(
psA
[
tj
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -403,84 +267,12 @@ if( atomI == targetAtom ){
psA
[
tj
].
eFieldP
[
1
]
+=
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
ijField
[
1
][
2
];
// #ifdef AMOEBA_DEBUG
#if 0
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
//unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
#if 0
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
#endif
}
#endif
// Gk field
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
gkFieldSum
[
0
]
+=
ijField
[
0
][
0
];
gkFieldSum
[
1
]
+=
ijField
[
0
][
1
];
...
...
@@ -491,41 +283,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
gkField
[
2
]
+=
ijField
[
1
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
//unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
...
...
@@ -548,11 +305,7 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
loadFixedFieldParticleData
(
&
(
psA
[
tj
]),
&
jCoord
,
jDipole
,
jQuadrupole
,
&
jBornRadius
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
float
dScaleVal
;
float
pScaleVal
;
...
...
@@ -579,85 +332,12 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
eFieldP
[
1
]
+=
pScaleVal
*
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
pScaleVal
*
ijField
[
1
][
2
];
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
// unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
// unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
// pullBackIndex += 2;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dScaleVal*ijField[indexI][0];
debugArray[index].y = dScaleVal*ijField[indexI][1];
debugArray[index].z = dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pScaleVal*ijField[indexI][0];
debugArray[index].y = pScaleVal*ijField[indexI][1];
debugArray[index].z = pScaleVal*ijField[indexI][2];
*/
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
// GK field
calculateFixedGkFieldPairIxn_kernel
(
iCoord
,
jCoord
,
&
(
labFrameDipole
[
atomI
*
3
]),
jDipole
,
&
(
labFrameQuadrupole
[
atomI
*
9
]),
jQuadrupole
,
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
bornRadii
[
atomI
]
*
jBornRadius
,
ijField
);
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
gkFieldSum
[
0
]
+=
ijField
[
0
][
0
];
...
...
@@ -668,61 +348,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
gkField
[
1
]
+=
ijField
[
1
][
1
];
psA
[
tj
].
gkField
[
2
]
+=
ijField
[
1
][
2
];
}
//#ifdef AMOEBA_DEBUG
#if 0
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
// pullBackIndex += 2;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][0] : 0.0f;
debugArray[index].y = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][1] : 0.0f;
debugArray[index].z = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][0] : 0.0f;
debugArray[index].y = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][1] : 0.0f;
debugArray[index].z = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][2] : 0.0f;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -76,24 +95,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"computeCudaAmoebaFixedEField"
;
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
// N2 debug array
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
10
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
// print intermediate results for the targetAtom
unsigned
int
targetAtom
=
3
;
#endif
kClearFields_3
(
amoebaGpu
,
2
);
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -108,15 +109,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaFixedEField numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
...
...
@@ -131,78 +123,5 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR
(
"kCalculateAmoebaFixedE_FieldN2Forces_kernel"
);
#if 0
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReduceE_Fields_kernel
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"OutEFields
\n
"
);
int
maxPrint
=
32
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// E_Field
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
indexOffset
+
2
]
);
// E_Field polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Epol[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"EFields End
\n
"
);
// write results to file
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEField"
,
fileId
,
outputVector
);
}
//delete debugArray;
}
#endif
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
View file @
2b508482
...
...
@@ -37,15 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
void
METHOD_NAME
(
kCalculateAmoebaFixedE_Field
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputEField
,
float
*
outputEFieldPolar
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4
pullBack
[
12
];
#endif
float
*
outputEFieldPolar
){
extern
__shared__
FixedFieldParticle
sA
[];
...
...
@@ -106,11 +98,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
float
ijField
[
2
][
3
];
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
unsigned
int
match
=
(
atomI
==
(
y
+
j
))
?
1
:
0
;
...
...
@@ -142,11 +130,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
//loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
float
dScaleVal
;
float
pScaleVal
;
...
...
@@ -168,73 +152,6 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
fieldPolarSum
[
1
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
1
];
fieldPolarSum
[
2
]
+=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
0
][
2
];
#ifdef AMOEBA_DEBUG
if
(
0
&&
atomI
==
targetAtom
){
unsigned
int
index
=
atomI
==
targetAtom
?
(
y
+
j
)
:
atomI
;
// unsigned int pullBackIndex = 0;
unsigned
int
indexI
=
0
;
unsigned
int
indexJ
=
indexI
?
0
:
1
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
dScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
dScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
dScaleVal
*
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
pScaleVal
*
ijField
[
indexI
][
2
];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
}
}
...
...
@@ -274,11 +191,7 @@ if( 0 && atomI == targetAtom ){
float
ijField
[
2
][
3
];
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -300,74 +213,6 @@ if( 0 && atomI == targetAtom ){
psA
[
tj
].
eFieldP
[
1
]
+=
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
ijField
[
1
][
2
];
#ifdef AMOEBA_DEBUG
if
(
0
&&
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
// unsigned int pullBackIndex = 0;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
#if 0
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
#endif
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
...
...
@@ -388,11 +233,7 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
float
ijField
[
2
][
3
];
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
float
dScaleVal
;
float
pScaleVal
;
...
...
@@ -419,79 +260,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA
[
tj
].
eFieldP
[
1
]
+=
pScaleVal
*
ijField
[
1
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
pScaleVal
*
ijField
[
1
][
2
];
#ifdef AMOEBA_DEBUG
if
(
0
&&
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
// unsigned int pullBackIndex = 0;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
dScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
dScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
dScaleVal
*
ijField
[
indexI
][
2
];
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pScaleVal
*
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
pScaleVal
*
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
pScaleVal
*
ijField
[
indexI
][
2
];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
View file @
2b508482
...
...
@@ -151,11 +151,7 @@ __device__ static void zeroFixedFieldParticleSharedField( struct FixedFieldParti
// body of fixed E-field calculation
__device__
static
void
calculateFixedEFieldPairIxn_kernel
(
FixedFieldParticle
&
atomI
,
FixedFieldParticle
&
atomJ
,
float
field
[
2
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
debugArray
[
12
]
#endif
)
float
field
[
2
][
3
])
{
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -204,44 +224,12 @@ void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
void
kCalculateAmoebaGrycukBornRadii
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateAmoebaGrycukBornRadii"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
amoebaGpu
->
scalingDistanceCutoff
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
unsigned
int
targetAtom
=
0
;
gpu
->
psBornRadii
->
Download
();
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Grycuk input
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born %6d %16.9e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -255,28 +243,7 @@ void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
GrycukParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
maxThreads
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukParticle
),
sizeof
(
GrycukParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
(
gpu
->
bOutputBufferPerWarp
?
" "
:
" no "
),
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukParticle
),
sizeof
(
GrycukParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
);
...
...
@@ -330,13 +297,7 @@ __device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRulePa
}
//#define AMOEBA_DEBUG
__device__
void
calculateGrycukChainRulePairIxn_kernel
(
GrycukChainRuleParticle
&
atomI
,
GrycukChainRuleParticle
&
atomJ
,
float
force
[
3
]
#ifdef AMOEBA_DEBUG
,
float4
pullDebug
[
5
]
#endif
){
__device__
void
calculateGrycukChainRulePairIxn_kernel
(
GrycukChainRuleParticle
&
atomI
,
GrycukChainRuleParticle
&
atomJ
,
float
force
[
3
]
){
const
float
pi
=
3.1415926535897
f
;
float
third
=
1.0
f
/
3.0
f
;
...
...
@@ -390,18 +351,6 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
float
dbr
=
term
*
de
/
r
;
de
=
dbr
*
atomI
.
bornForce
;
#ifdef AMOEBA_DEBUG
pullDebug
[
0
].
x
=
de
;
pullDebug
[
0
].
y
=
r
;
pullDebug
[
0
].
z
=
factor
;
pullDebug
[
0
].
w
=
-
4.0
f
;
pullDebug
[
1
].
x
=
atomI
.
bornForce
/
4.184
f
;
pullDebug
[
1
].
y
=
atomI
.
bornRadius
;
pullDebug
[
1
].
z
=
atomJ
.
bornForce
/
4.184
f
;
pullDebug
[
1
].
w
=
-
5.0
f
;
#endif
force
[
0
]
=
xr
*
de
;
force
[
1
]
=
yr
*
de
;
force
[
2
]
=
zr
*
de
;
...
...
@@ -430,43 +379,12 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
void
kCalculateGrycukGbsaForces2
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateGrycukGbsaForces2"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
20
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
20
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
unsigned
int
targetAtom
=
0
;
gpu
->
psBornRadii
->
Download
();
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Grycuk input
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born %6d %16.9e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -480,64 +398,15 @@ void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
GrycukChainRuleParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
maxThreads
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukChainRuleParticle
),
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
(
gpu
->
bOutputBufferPerWarp
?
" "
:
" no "
),
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
GrycukChainRuleParticle
),
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
//kClearForces( gpu );
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
#endif
);
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
);
}
else
{
kCalculateAmoebaGrycukChainRuleN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
#endif
);
kCalculateAmoebaGrycukChainRuleN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
GrycukChainRuleParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
);
}
LAUNCHERROR
(
"kCalculateAmoebaCudaGrycukN2Forces"
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
debugArray
->
Download
();
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d %5d DebugGrycukChain
\n
"
,
targetAtom
,
jj
);
for
(
int
kk
=
0
;
kk
<
7
;
kk
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e %16.9e]
\n
"
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
debugIndex
+=
paddedNumberOfAtoms
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
#endif
if
(
0
){
static
int
callId
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
View file @
2b508482
...
...
@@ -34,11 +34,7 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__
(
G8X_NONBOND_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaGrycukChainRule
,
_kernel
)(
unsigned
int
*
workUnit
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
void
METHOD_NAME
(
kCalculateAmoebaGrycukChainRule
,
_kernel
)(
unsigned
int
*
workUnit
){
extern
__shared__
GrycukChainRuleParticle
sAChainRule
[];
...
...
@@ -49,10 +45,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
unsigned
int
lasty
=
0xFFFFFFFF
;
#ifdef AMOEBA_DEBUG
float4
pullDebug
[
5
];
#endif
while
(
pos
<
end
)
{
...
...
@@ -85,11 +77,7 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
for
(
unsigned
int
j
=
(
tgx
+
1
)
&
(
GRID
-
1
);
j
!=
tgx
;
j
=
(
j
+
1
)
&
(
GRID
-
1
))
{
float
localForce
[
3
];
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
j
],
localForce
#ifdef AMOEBA_DEBUG
,
pullDebug
#endif
);
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
j
],
localForce
);
if
(
(
atomI
!=
(
y
+
j
))
&&
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
j
)
<
cSim
.
atoms
)
){
localParticle
.
force
[
0
]
-=
localForce
[
0
];
...
...
@@ -100,54 +88,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
psAChainRule
[
j
].
force
[
1
]
+=
localForce
[
1
];
psAChainRule
[
j
].
force
[
2
]
+=
localForce
[
2
];
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
(
y
+
j
)
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
j
)
:
atomI
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
debugArray
[
index
].
z
=
-
1
.
0
f
;
debugArray
[
index
].
w
=
-
1
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
tgx
;
debugArray
[
index
].
w
=
-
2
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
1
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
1
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
1
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
1
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
localForce
[
0
];
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
12
.
0
f
;
calculateGrycukChainRulePairIxn_kernel
(
psAChainRule
[
j
],
localParticle
,
localForce
,
pullDebug
);
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
-
13
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
localForce
[
0
];
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
14
.
0
f
;
}
#endif
}
}
...
...
@@ -181,11 +121,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
float
localForce
[
3
];
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
tj
],
localForce
#ifdef AMOEBA_DEBUG
,
pullDebug
#endif
);
calculateGrycukChainRulePairIxn_kernel
(
localParticle
,
psAChainRule
[
tj
],
localForce
);
localParticle
.
force
[
0
]
-=
localForce
[
0
];
localParticle
.
force
[
1
]
-=
localForce
[
1
];
...
...
@@ -195,54 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
psAChainRule
[
tj
].
force
[
1
]
+=
localForce
[
1
];
psAChainRule
[
tj
].
force
[
2
]
+=
localForce
[
2
];
#ifdef AMOEBA_DEBUG
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
if
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
){
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
-
1
.
0
f
;
debugArray
[
index
].
w
=
-
1
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
tgx
;
debugArray
[
index
].
w
=
-
2
.
0
f
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
1
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
1
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
1
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
1
].
w
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
localForce
[
0
];
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
10
.
0
f
;
}
#endif
calculateGrycukChainRulePairIxn_kernel
(
psAChainRule
[
tj
],
localParticle
,
localForce
#ifdef AMOEBA_DEBUG
,
pullDebug
#endif
);
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
){
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
localForce
[
1
];
debugArray
[
index
].
z
=
localForce
[
2
];
debugArray
[
index
].
w
=
-
11
.
0
f
;
}
#endif
calculateGrycukChainRulePairIxn_kernel
(
psAChainRule
[
tj
],
localParticle
,
localForce
);
localParticle
.
force
[
0
]
+=
localForce
[
0
];
localParticle
.
force
[
1
]
+=
localForce
[
1
];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
//#include "amoebaGpuTypes.h"
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define INCLUDE_TORQUE
#include "kCalculateAmoebaCudaKirkwoodParticle.h"
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -1638,26 +1657,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
// reduce psWorkArray_1_1 -> dBorn
// reduce psWorkArray_1_2 -> dBornPolar
#ifdef AMOEBA_DEBUG
static
void
kReduce_dBorn
(
amoebaGpuContext
amoebaGpu
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
/*
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn1");
*/
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_1_2
->
_pDevData
,
amoebaGpu
->
psBornPolar
->
_pDevData
,
0
);
LAUNCHERROR
(
"kReduce_dBorn2"
);
}
#endif
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
...
...
@@ -1940,40 +1939,6 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
amoebaGpu
->
gpuContext
->
psBornForce
->
_pDevData
);
}
LAUNCHERROR
(
"kReduceToBornForcePrefactor"
);
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
// kClearEnergy() should be called prior to kReduceToBornForcePrefactorAndSASA_kernel
double
energy
=
kReduceEnergy
(
amoebaGpu
->
gpuContext
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born force w/ cavity energy=%15.7e.
\n
"
,
energy
);
(
void
)
fflush
(
amoebaGpu
->
log
);
/*
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
(void) fprintf( amoebaGpu->log,"bF %16.9e obc=%16.9e bR=%16.9e\n",
amoebaGpu->gpuContext->psBornForce->_pSysData[ii],
amoebaGpu->gpuContext->psObcData->_pSysData[ii].x,
amoebaGpu->gpuContext->psBornRadii->_pSysData[ii] );
}
(void) fflush( amoebaGpu->log );
*/
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloat4Array
(
amoebaGpu
->
gpuContext
->
natoms
,
3
,
amoebaGpu
->
gpuContext
->
psPosq4
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
amoebaGpu
->
gpuContext
->
natoms
,
1
,
amoebaGpu
->
gpuContext
->
psBornRadii
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloat2Array
(
amoebaGpu
->
gpuContext
->
natoms
,
2
,
amoebaGpu
->
gpuContext
->
psObcData
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
amoebaGpu
->
gpuContext
->
natoms
,
1
,
amoebaGpu
->
gpuContext
->
psBornForce
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaBornForce"
,
fileId
,
outputVector
);
}
}
#endif
#undef AMOEBA_DEBUG
}
/**---------------------------------------------------------------------------------------
...
...
@@ -1988,44 +1953,11 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
void
kCalculateAmoebaKirkwood
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateAmoebaKirkwood"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d maxCovalentDegreeSz=%d ZZZ
\n
"
,
methodName
,
gpu
->
natoms
,
amoebaGpu
->
maxCovalentDegreeSz
);
amoebaGpu
->
scalingDistanceCutoff
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
unsigned
int
targetAtom
=
0
;
gpu
->
psBornRadii
->
Download
();
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Kirkwood input
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Born %6d %16.9e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static
unsigned
int
threadsPerBlock
=
0
;
...
...
@@ -2040,47 +1972,17 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaKirkwood: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
maxThreads
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
KirkwoodParticle
),
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
}
kClearFields_1
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaKirkwoodN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
\n
"
,
(
gpu
->
bOutputBufferPerWarp
?
" "
:
" no "
),
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
KirkwoodParticle
),
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
);
#else
);
#endif
gpu
->
psWorkUnit
->
_pDevData
);
}
else
{
kCalculateAmoebaCudaKirkwoodN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
#ifdef AMOEBA_DEBUG
,
debugArray
->
_pDevData
,
targetAtom
);
#else
);
#endif
gpu
->
psWorkUnit
->
_pDevData
);
}
LAUNCHERROR
(
"kCalculateAmoebaCudaKirkwoodN2Forces"
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
View file @
2b508482
...
...
@@ -35,15 +35,7 @@ __launch_bounds__(128, 1)
__launch_bounds__
(
64
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaCudaKirkwood
,
Forces_kernel
)(
unsigned
int
*
workUnit
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4
pullBack
[
20
];
#endif
unsigned
int
*
workUnit
){
extern
__shared__
KirkwoodParticle
sA
[];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#include "kCalculateAmoebaCudaKirkwoodParticle.h"
#include "kCalculateAmoebaCudaKirkwoodEDiffParticle.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
...
...
@@ -865,19 +884,6 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
#endif
#ifdef AMOEBA_DEBUG
__device__
static
int
debugAccumulate
(
unsigned
int
index
,
float4
*
debugArray
,
float
*
field
,
unsigned
int
addMask
,
float
idLabel
)
{
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
addMask
?
field
[
0
]
:
0.0
f
;
debugArray
[
index
].
y
=
addMask
?
field
[
1
]
:
0.0
f
;
debugArray
[
index
].
z
=
addMask
?
field
[
2
]
:
0.0
f
;
debugArray
[
index
].
w
=
idLabel
;
return
index
;
}
#endif
__device__
void
zeroKirkwoodEDiffParticleSharedField
(
struct
KirkwoodEDiffParticle
*
sA
)
{
// zero shared fields
...
...
@@ -931,22 +937,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
static
int
timestep
=
0
;
timestep
++
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"kCalculateAmoebaKirkwoodEDiff"
;
std
::
vector
<
int
>
fileId
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// apparently debug array can take up nontrivial no. registers
//
static
unsigned
int
threadsPerBlock
=
0
;
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
...
...
@@ -960,23 +954,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodEDiffParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u step=%d
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
KirkwoodEDiffParticle
),
sizeof
(
KirkwoodEDiffParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sm_version
,
gpu
->
device
,
gpu
->
sharedMemoryPerBlock
,
timestep
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodEDiffParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
}
else
{
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
KirkwoodEDiffParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
View file @
2b508482
...
...
@@ -35,8 +35,7 @@ __launch_bounds__(96, 1)
__launch_bounds__
(
32
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaCudaKirkwoodEDiff
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputTorque
){
unsigned
int
*
workUnit
,
float
*
outputTorque
){
extern
__shared__
KirkwoodEDiffParticle
sA
[];
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
View file @
2b508482
...
...
@@ -1616,19 +1616,7 @@ void kCalculateAmoebaLocalForces_kernel()
void
kCalculateAmoebaLocalForces
(
amoebaGpuContext
gpu
)
{
#ifdef AMOEBA_DEBUG
if
(
gpu
->
log
){
static
int
call
=
1
;
if
(
call
==
0
){
(
void
)
fprintf
(
gpu
->
log
,
"kCalculateAmoebaLocalForces: blks=%u thrds/blk=%u
\n
"
,
gpu
->
gpuContext
->
sim
.
blocks
,
gpu
->
gpuContext
->
sim
.
localForces_threads_per_block
);
fflush
(
gpu
->
log
);
call
++
;
}
}
#endif
kCalculateAmoebaLocalForces_kernel
<<<
gpu
->
gpuContext
->
sim
.
blocks
,
gpu
->
gpuContext
->
sim
.
localForces_threads_per_block
>>>
();
LAUNCHERROR
(
"kCalculateAmoebaLocalForces"
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
View file @
2b508482
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#define BLOCK_SIZE 128
using
namespace
std
;
static
__constant__
cudaGmxSimulation
cSim
;
...
...
@@ -469,41 +491,4 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
amoebaAddMapTorqueForceToForce_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
threads_per_block
>>>
(
);
LAUNCHERROR
(
"amoebaAddMapTorqueForceToForce"
);
}
#ifdef AMOEBA_DEBUG
if
(
0
){
VectorOfDoubleVectors
outputVector
;
std
::
vector
<
int
>
fileId
;
static
int
call
=
0
;
fileId
.
push_back
(
call
++
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float
>*
temp
=
new
CUDAStream
<
float
>
(
3
*
paddedNumberOfAtoms
,
1
,
"TempMapTorqueAndAddToForce2"
);
reduceAndCopyCUDAStreamFloat4
(
gpu
->
psForce4
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
reduceAndCopyCUDAStreamFloat4
(
amoebaGpu
->
psTorqueMapForce4
,
temp
,
1.0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
temp
,
outputVector
,
NULL
,
1.0
f
/
4.184
f
);
for
(
int
pId
=
0
;
pId
<
5
;
pId
++
){
float
sum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
stderr
,
"
\n\n
TorqueForceToForce for part=%d
\n
"
,
pId
);
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
amoebaSim
.
maxTorqueBufferIndex
;
ii
++
){
(
void
)
fprintf
(
stderr
,
"%4d [%15.7e %15.7e %15.7e]
\n
"
,
ii
,
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
x
,
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
y
,
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
z
);
sum
[
0
]
+=
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
x
;
sum
[
1
]
+=
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
y
;
sum
[
2
]
+=
amoebaGpu
->
psTorqueMapForce4
->
_pSysStream
[
ii
][
pId
].
z
;
}
(
void
)
fprintf
(
stderr
,
"TorqueForceToForce for partcle=%d [%15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e]
\n
"
,
pId
,
sum
[
0
],
sum
[
1
],
sum
[
2
],
sum
[
0
]
/
4.184
f
,
sum
[
1
]
/
4.184
f
,
sum
[
2
]
/
4.184
f
);
}
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaElectrostatiPostTorqueForce"
,
fileId
,
outputVector
);
delete
temp
;
}
#endif
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
...
...
@@ -33,21 +55,12 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
RTERROR
(
status
,
"GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"
);
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef GK
__device__
void
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
,
float
fields
[
8
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
)
float
fields
[
8
][
3
]
)
{
float
deltaR
[
3
];
...
...
@@ -119,13 +132,7 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedPa
}
__device__
void
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
,
float
gkField
[
8
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
)
float
gkField
[
8
][
3
]
)
{
float
gux
[
5
];
...
...
@@ -213,20 +220,6 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced
}
#ifdef AMOEBA_DEBUG
__device__
static
int
debugAccumulate
(
int
index
,
float4
*
debugArray
,
float
*
field
,
unsigned
int
addMask
,
float
idLabel
)
{
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
addMask
?
field
[
0
]
:
0.0
f
;
debugArray
[
index
].
y
=
addMask
?
field
[
1
]
:
0.0
f
;
debugArray
[
index
].
z
=
addMask
?
field
[
2
]
:
0.0
f
;
debugArray
[
index
].
w
=
idLabel
;
return
index
;
}
#endif
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
...
...
@@ -324,12 +317,6 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
epsilon
[
0
]
=
epsilon
[
0
]
<
delta
[
0
].
z
?
delta
[
0
].
z
:
epsilon
[
0
];
epsilon
[
0
]
=
epsilon
[
0
]
<
delta
[
0
].
w
?
delta
[
0
].
w
:
epsilon
[
0
];
epsilon
[
0
]
=
48.033324
f
*
sqrtf
(
epsilon
[
0
]
/
(
(
float
)
cSim
.
atoms
)
);
#ifdef AMOEBA_DEBUG
epsilon
[
1
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
x
/
(
(
float
)
cSim
.
atoms
)
);
epsilon
[
2
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
y
/
(
(
float
)
cSim
.
atoms
)
);
epsilon
[
3
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
z
/
(
(
float
)
cSim
.
atoms
)
);
epsilon
[
4
]
=
48.033324
f
*
sqrtf
(
delta
[
0
].
w
/
(
(
float
)
cSim
.
atoms
)
);
#endif
}
}
...
...
@@ -460,21 +447,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
static
int
iteration
=
1
;
int
targetAtom
=
0
;
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply"
;
if
(
1
&&
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
#endif
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// clear output arrays
...
...
@@ -493,28 +466,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
...
...
@@ -522,120 +480,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaMutualInducedAndGkFields"
);
kReduceMutualInducedAndGkFields
(
amoebaGpu
,
outputArray
,
outputPolarArray
,
outputArrayS
,
outputPolarArrayS
);
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
amoebaGpu
->
psWorkArray_3_3
->
Download
();
amoebaGpu
->
psWorkArray_3_4
->
Download
();
if
(
amoebaGpu
->
log
&&
iteration
==
1
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished MI kernel execution %d
\n
"
,
iteration
);
(
void
)
fflush
(
amoebaGpu
->
log
);
outputArray
->
Download
();
outputPolarArray
->
Download
();
outputArrayS
->
Download
();
outputPolarArrayS
->
Download
();
debugArray
->
Download
();
int
maxPrint
=
33
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// Mi
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Mult[%16.9e %16.9e %16.9e] "
,
outputArray
->
_pSysData
[
indexOffset
],
outputArray
->
_pSysData
[
indexOffset
+
1
],
outputArray
->
_pSysData
[
indexOffset
+
2
]
);
// Mi polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
" MultP[%16.9e %16.9e %16.9e]
\n
"
,
outputPolarArray
->
_pSysData
[
indexOffset
],
outputPolarArray
->
_pSysData
[
indexOffset
+
1
],
outputPolarArray
->
_pSysData
[
indexOffset
+
2
]
);
// MiS
(
void
)
fprintf
(
amoebaGpu
->
log
,
" MultS[%16.9e %16.9e %16.9e] "
,
outputArrayS
->
_pSysData
[
indexOffset
],
outputArrayS
->
_pSysData
[
indexOffset
+
1
],
outputArrayS
->
_pSysData
[
indexOffset
+
2
]
);
// Mi polarS
(
void
)
fprintf
(
amoebaGpu
->
log
,
"MultPS[%16.9e %16.9e %16.9e]
\n
"
,
outputPolarArrayS
->
_pSysData
[
indexOffset
],
outputPolarArrayS
->
_pSysData
[
indexOffset
+
1
],
outputPolarArrayS
->
_pSysData
[
indexOffset
+
2
]
);
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
if
(
ii
==
targetAtom
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d %4d Rint [%16.9e %16.9e %16.9e %16.9e]
\n
"
,
ii
,
jj
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
for
(
int
kk
=
0
;
kk
<
9
;
kk
++
){
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e %5.1f]
\n
"
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
}
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
iteration
++
;
}
delete
debugArray
;
#endif
}
/**---------------------------------------------------------------------------------------
...
...
@@ -649,22 +500,12 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
static
void
cudaComputeAmoebaMutualInducedAndGkFieldBySOR
(
amoebaGpuContext
amoebaGpu
)
{
// ---------------------------------------------------------------------------------------
static
int
timestep
=
0
;
timestep
++
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedAndGkFieldBySOR"
;
std
::
vector
<
int
>
fileId
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
int
done
;
int
iteration
;
static
int
timestep
=
0
;
timestep
++
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
...
...
@@ -685,49 +526,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
cudaMemcpy
(
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
cudaMemcpy
(
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaMutualInducedAndGkFieldBySOR Initial setup for matrix multiply
\n
"
);
fflush
(
amoebaGpu
->
log
);
gpu
->
psPosq4
->
Download
();
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
amoebaGpu
->
psInducedDipole
->
Download
(),
amoebaGpu
->
psInducedDipolePolar
->
Download
();
amoebaGpu
->
psInducedDipoleS
->
Download
(),
amoebaGpu
->
psInducedDipolePolarS
->
Download
();
amoebaGpu
->
psPolarizability
->
Download
();
int
offset
=
0
;
int
maxPrint
=
10
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%4d pol=%12.4e
\n
"
,
ii
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
);
if
(
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
]
||
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"PolX!!! %12.4e %12.4e "
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
" E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] Pos[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
],
gpu
->
psPosq4
->
_pSysData
[
ii
].
x
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
y
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
z
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Gk[%14.6e %14.6e %14.6e] MiS[%14.6e %14.6e %14.6e] MipS[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psGk_Field
->
_pSysData
[
offset
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psGk_Field
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
2
]
);
offset
+=
3
;
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
)
ii
=
(
gpu
->
natoms
-
maxPrint
);
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// if polarization type is direct, set flags signalling done and return
if
(
amoebaGpu
->
amoebaSim
.
polarizationType
)
...
...
@@ -800,91 +598,11 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
done
=
1
;
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaMutualInducedAndGkFieldBySOR step=%d iteration=%3d eps %14.6e done=%d
\n
"
,
timestep
,
iteration
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
done
);
}
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psInducedDipole
->
Download
();
amoebaGpu
->
psInducedDipolePolar
->
Download
();
amoebaGpu
->
psInducedDipoleS
->
Download
();
amoebaGpu
->
psInducedDipolePolarS
->
Download
();
amoebaGpu
->
psWorkVector
[
2
]
->
Download
();
amoebaGpu
->
psWorkVector
[
3
]
->
Download
();
amoebaGpu
->
psGk_Field
->
Download
();
#if 1
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d
\n
"
,
methodName
,
iteration
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
1
],
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
2
],
done
);
#else
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d
\n
"
,
methodName
,
iteration
,
sum1
,
sum2
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
0
],
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
1
],
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
2
],
done
);
#endif
(
void
)
fflush
(
amoebaGpu
->
log
);
int
offset
=
0
;
int
maxPrint
=
20
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d "
,
ii
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Mi[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Mip[%14.6e %14.6e %14.6e]"
,
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
" MiS[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipoleS
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"MipS[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolarS
->
_pSysData
[
offset
+
2
]
);
/*
(void) fprintf( amoebaGpu->log,"Gk [%14.6e %14.6e %14.6e]\n",
amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"W2 [%14.6e %14.6e %14.6e] ",
amoebaGpu->psWorkVector[2]->_pSysData[offset], amoebaGpu->psWorkVector[2]->_pSysData[offset+1], amoebaGpu->psWorkVector[2]->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"W3 [%14.6e %14.6e %14.6e]\n",
amoebaGpu->psWorkVector[3]->_pSysData[offset], amoebaGpu->psWorkVector[3]->_pSysData[offset+1], amoebaGpu->psWorkVector[3]->_pSysData[offset+2] );
*/
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
){
ii
=
(
gpu
->
natoms
-
maxPrint
);
offset
=
3
*
(
ii
+
1
);
}
else
{
offset
+=
3
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
iteration
++
;
}
amoebaGpu
->
mutualInducedDone
=
done
;
amoebaGpu
->
mutualInducedConverged
=
(
!
done
||
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
)
?
0
:
1
;
#ifdef AMOEBA_DEBUG
if
(
0
&&
amoebaGpu
->
log
){
trackMutualInducedIterations
(
amoebaGpu
,
iteration
);
}
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipoleS
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolarS
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"Cuda_GK_MI"
,
fileId
,
outputVector
);
}
#endif
// ---------------------------------------------------------------------------------------
}
void
cudaComputeAmoebaMutualInducedAndGkField
(
amoebaGpuContext
amoebaGpu
)
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
View file @
2b508482
...
...
@@ -39,11 +39,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
float
*
outputField
,
float
*
outputFieldPolar
,
float
*
outputFieldS
,
float
*
outputFieldPolarS
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
float
*
outputFieldPolarS
){
extern
__shared__
MutualInducedParticle
sA
[];
...
...
@@ -112,11 +108,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
// load coords, charge, ...
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
j
)
>=
cSim
.
atoms
)
)
?
0
:
1
;
...
...
@@ -138,27 +130,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
fieldPolarSumS
[
1
]
+=
mask
?
ijField
[
5
][
1
]
:
0
.
0
f
;
fieldPolarSumS
[
2
]
+=
mask
?
ijField
[
5
][
2
]
:
0
.
0
f
;
//#ifdef AMOEBA_DEBUG
#if 0
unsigned int index = y + j;
if( atomI == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
index = debugAccumulate( index, debugArray, ijField[0], mask, 1.0f );
index = debugAccumulate( index, debugArray, ijField[1], mask, 2.0f );
index = debugAccumulate( index, debugArray, ijField[4], mask, 3.0f );
index = debugAccumulate( index, debugArray, ijField[5], mask, 4.0f );
}
#endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
j
],
ijField
);
// atomI == atomJ contribution included
...
...
@@ -171,24 +143,6 @@ if( atomI == targetAtom ){
fieldPolarSumS
[
1
]
+=
mask
?
ijField
[
2
][
1
]
:
0
.
0
f
;
fieldPolarSumS
[
2
]
+=
mask
?
ijField
[
2
][
2
]
:
0
.
0
f
;
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom ){
index = debugAccumulate( index, debugArray, ijField[0], mask, 5.0f );
index = debugAccumulate( index, debugArray, ijField[2], mask, 6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = 9.0f;
}
#endif
}
// Write results
...
...
@@ -213,9 +167,8 @@ if( atomI == targetAtom ){
load3dArray
(
offset
,
fieldPolarSumS
,
outputFieldPolarS
);
#endif
}
else
// 100% utilization
{
}
else
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
...
...
@@ -235,11 +188,7 @@ if( atomI == targetAtom ){
// load coords, charge, ...
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
...
...
@@ -289,29 +238,7 @@ if( atomI == targetAtom ){
}
//#ifdef AMOEBA_DEBUG
#if 0
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -1.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+1], maskD, -2.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+4], maskD, -3.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f );
}
#endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
#ifdef AMOEBA_DEBUG
,
debugArray
#endif
);
calculateMutualInducedAndGkFieldsGkPairIxn_kernel
(
localParticle
,
psA
[
tj
],
ijField
);
if
(
(
atomI
<
cSim
.
atoms
)
&&
((
y
+
tj
)
<
cSim
.
atoms
)
){
...
...
@@ -337,24 +264,6 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
psA
[
tj
].
fieldPolarS
[
2
]
+=
ijField
[
3
][
2
];
}
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -5.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = -9.0f;
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
View file @
2b508482
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
...
...
@@ -33,19 +55,10 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
RTERROR
(
status
,
"GetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"
);
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
__device__
void
calculateMutualInducedFieldPairIxn_kernel
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
,
float
fields
[
4
][
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
#endif
)
float
fields
[
4
][
3
]
)
{
float
deltaR
[
3
];
...
...
@@ -252,20 +265,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
int
targetAtom
=
1231
;
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedFieldMatrixMultiply"
;
static
int
iteration
=
1
;
if
(
1
&&
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s
\n
"
,
methodName
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float4
>*
debugArray
=
new
CUDAStream
<
float4
>
(
paddedNumberOfAtoms
*
paddedNumberOfAtoms
,
1
,
"DebugArray"
);
memset
(
debugArray
->
_pSysData
,
0
,
sizeof
(
float
)
*
4
*
paddedNumberOfAtoms
*
paddedNumberOfAtoms
);
debugArray
->
Upload
();
#endif
kClearFields_3
(
amoebaGpu
,
2
);
if
(
threadsPerBlock
==
0
){
...
...
@@ -279,169 +278,23 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
),
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaMutualInducedFieldN2_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaMutualInducedField"
);
kReduceMutualInducedFields
(
amoebaGpu
,
outputArray
,
outputPolarArray
);
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_1
->
Download
();
amoebaGpu
->
psWorkArray_3_2
->
Download
();
if
(
amoebaGpu
->
log
&&
iteration
==
-
1
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished MI kernel execution %d
\n
"
,
iteration
);
(
void
)
fflush
(
amoebaGpu
->
log
);
outputArray
->
Download
();
outputPolarArray
->
Download
();
debugArray
->
Download
();
int
maxPrint
=
1400
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// MI
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Mult[%16.9e %16.9e %16.9e] "
,
outputArray
->
_pSysData
[
indexOffset
],
outputArray
->
_pSysData
[
indexOffset
+
1
],
outputArray
->
_pSysData
[
indexOffset
+
2
]
);
// MI polar
(
void
)
fprintf
(
amoebaGpu
->
log
,
"MultP[%16.9e %16.9e %16.9e] "
,
outputPolarArray
->
_pSysData
[
indexOffset
],
outputPolarArray
->
_pSysData
[
indexOffset
+
1
],
outputPolarArray
->
_pSysData
[
indexOffset
+
2
]
);
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
if
(
ii
==
targetAtom
){
float
sums
[
4
][
3
]
=
{
{
0.0
f
,
0.0
f
,
0.0
f
},
{
0.0
f
,
0.0
f
,
0.0
f
},
{
0.0
f
,
0.0
f
,
0.0
f
},
{
0.0
f
,
0.0
f
,
0.0
f
}
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
unsigned
int
count
=
0
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d %4d Pint [%16.9e %16.9e %16.9e %16.9e] "
,
ii
,
jj
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
//debugIndex += gpu->natoms;
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e] "
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
);
int
index
=
0
;
sums
[
index
][
0
]
+=
debugArray
->
_pSysData
[
debugIndex
].
x
;
sums
[
index
][
1
]
+=
debugArray
->
_pSysData
[
debugIndex
].
y
;
sums
[
index
][
2
]
+=
debugArray
->
_pSysData
[
debugIndex
].
z
;
if
(
count
&&
(
(
count
%
31
)
==
0
)
){
static
float
saveSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Block sum [%16.9e %16.9e %16.9e] "
,
sums
[
index
][
0
]
-
saveSum
[
0
],
sums
[
index
][
1
]
-
saveSum
[
1
],
sums
[
index
][
2
]
-
saveSum
[
2
]
);
saveSum
[
0
]
=
sums
[
index
][
0
];
saveSum
[
1
]
=
sums
[
index
][
1
];
saveSum
[
2
]
=
sums
[
index
][
2
];
}
debugIndex
+=
paddedNumberOfAtoms
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e] "
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
);
index
++
;
sums
[
index
][
0
]
+=
debugArray
->
_pSysData
[
debugIndex
].
x
;
sums
[
index
][
1
]
+=
debugArray
->
_pSysData
[
debugIndex
].
y
;
sums
[
index
][
2
]
+=
debugArray
->
_pSysData
[
debugIndex
].
z
;
if
(
count
&&
(
(
count
%
31
)
==
0
)
){
static
float
saveSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Block sumP [%16.9e %16.9e %16.9e] "
,
sums
[
index
][
0
]
-
saveSum
[
0
],
sums
[
index
][
1
]
-
saveSum
[
1
],
sums
[
index
][
2
]
-
saveSum
[
2
]
);
saveSum
[
0
]
=
sums
[
index
][
0
];
saveSum
[
1
]
=
sums
[
index
][
1
];
saveSum
[
2
]
=
sums
[
index
][
2
];
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
count
++
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
int
index
=
0
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum1 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum2 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum3 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Sum4 [%16.9e %16.9e %16.9e]
\n
"
,
sums
[
index
][
0
],
sums
[
index
][
1
],
sums
[
index
][
2
]
);
index
++
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
iteration
++
;
}
delete
debugArray
;
#endif
}
/**---------------------------------------------------------------------------------------
...
...
@@ -457,18 +310,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaMutualInducedFieldBySOR"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// ---------------------------------------------------------------------------------------
int
done
;
int
iteration
;
...
...
@@ -489,38 +330,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
cudaMemcpy
(
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
cudaMemcpy
(
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
amoebaGpu
->
psInducedDipole
->
Download
(),
amoebaGpu
->
psInducedDipolePolar
->
Download
();
amoebaGpu
->
psPolarizability
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s Initial setup for matrix multiply
\n
"
,
methodName
);
int
offset
=
0
;
int
maxPrint
=
20000
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d pol=%12.4e "
,
ii
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
);
if
(
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
]
||
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
]
!=
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"PolX!!! %12.4e %12.4e "
,
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psPolarizability
->
_pSysData
[
offset
+
2
]
);
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
" E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psE_FieldPolar
->
_pSysData
[
offset
+
2
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
offset
+=
3
;
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
)
ii
=
(
gpu
->
natoms
-
maxPrint
);
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// if polarization type is direct, set flags signalling done and return
if
(
amoebaGpu
->
amoebaSim
.
polarizationType
)
...
...
@@ -558,12 +367,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
amoebaGpu
->
psCurrentEpsilon
->
_pDevData
);
LAUNCHERROR
(
"kReduceMutualInducedFieldDelta"
);
#ifdef AMOEBA_DEBUG
if
(
0
&&
amoebaGpu
->
log
){
// trackMutualInducedIterations
trackMutualInducedIterations
(
amoebaGpu
,
iteration
);
}
#endif
// Debye=48.033324f
amoebaGpu
->
psCurrentEpsilon
->
Download
();
float
currentEpsilon
=
amoebaGpu
->
psCurrentEpsilon
->
_pSysData
[
0
];
...
...
@@ -572,54 +375,12 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
if
(
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
||
amoebaGpu
->
mutualInducedCurrentEpsilon
<
amoebaGpu
->
mutualInducedTargetEpsilon
){
done
=
1
;
}
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psInducedDipole
->
Download
();
amoebaGpu
->
psInducedDipolePolar
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s iteration=%3d eps %14.6e done=%d
\n
"
,
methodName
,
iteration
,
amoebaGpu
->
mutualInducedCurrentEpsilon
,
done
);
(
void
)
fflush
(
amoebaGpu
->
log
);
int
offset
=
0
;
int
maxPrint
=
20
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%4d "
,
ii
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Mi[%14.6e %14.6e %14.6e] "
,
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Mip[%14.6e %14.6e %14.6e]
\n
"
,
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
1
],
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
+
2
]
);
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
)
)
){
ii
=
(
gpu
->
natoms
-
maxPrint
);
offset
=
3
*
(
ii
+
1
);
}
else
{
offset
+=
3
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
iteration
++
;
}
amoebaGpu
->
mutualInducedDone
=
done
;
amoebaGpu
->
mutualInducedConverged
=
(
!
done
||
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
)
?
0
:
1
;
#ifdef AMOEBA_DEBUG
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
// cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
NULL
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
NULL
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaMI"
,
fileId
,
outputVector
);
}
#endif
// ---------------------------------------------------------------------------------------
}
void
cudaComputeAmoebaMutualInducedField
(
amoebaGpuContext
amoebaGpu
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment