Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
761d7e17
Commit
761d7e17
authored
Apr 07, 2011
by
Mark Friedrichs
Browse files
Removal of limitation for 'long-range in sequence' covalent bonds
Reduced memory footprint
parent
80c4976e
Changes
36
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
524 additions
and
1034 deletions
+524
-1034
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
...rc/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+14
-14
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
...uda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+8
-7
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
...cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
+15
-15
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
...eba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+18
-4
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
...src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+24
-310
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
.../src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+37
-121
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
...ms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+19
-36
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
...rms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+24
-24
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
.../src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+18
-27
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
...a/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+23
-23
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+254
-391
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
...atforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+14
-9
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
...platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+10
-9
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
.../platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
+17
-17
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+9
-7
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
...orms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
+20
-20
No files found.
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
View file @
761d7e17
...
...
@@ -118,7 +118,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
#endif
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
j
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
j
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
...
...
@@ -162,7 +162,7 @@ if( atomI == targetAtom ){
// atomI == atomJ contribution included
mask
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
j
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
mask
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
j
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
fieldSumS
[
0
]
+=
mask
?
ijField
[
0
][
0
]
:
0
.
0
f
;
fieldSumS
[
1
]
+=
mask
?
ijField
[
0
][
1
]
:
0
.
0
f
;
fieldSumS
[
2
]
+=
mask
?
ijField
[
0
][
2
]
:
0
.
0
f
;
...
...
@@ -181,7 +181,7 @@ if( atomI == targetAtom ){
index = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = 9.0f;
...
...
@@ -195,7 +195,7 @@ if( atomI == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
...
...
@@ -204,7 +204,7 @@ if( atomI == targetAtom ){
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSumS
,
outputFieldPolarS
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
...
...
@@ -241,7 +241,7 @@ if( atomI == targetAtom ){
#endif
);
if
(
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
)
&&
((
y
+
tj
)
<
c
AmoebaSim
.
numberOfA
toms
)
){
if
(
(
atomI
<
c
Sim
.
a
toms
)
&&
((
y
+
tj
)
<
c
Sim
.
a
toms
)
){
// add to field at atomI the field due atomJ's dipole
...
...
@@ -294,7 +294,7 @@ if( atomI == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int maskD = (atomI < c
AmoebaSim.numberOfA
toms) && ((y+tj) < c
AmoebaSim.numberOfA
toms);
unsigned int maskD = (atomI < c
Sim.a
toms) && ((y+tj) < c
Sim.a
toms);
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
...
...
@@ -314,7 +314,7 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
);
if
(
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
)
&&
((
y
+
tj
)
<
c
AmoebaSim
.
numberOfA
toms
)
){
if
(
(
atomI
<
c
Sim
.
a
toms
)
&&
((
y
+
tj
)
<
c
Sim
.
a
toms
)
){
fieldSumS
[
0
]
+=
ijField
[
0
][
0
];
fieldSumS
[
1
]
+=
ijField
[
0
][
1
];
...
...
@@ -341,14 +341,14 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int maskD = (atomI < c
AmoebaSim.numberOfA
toms) && ((y+tj) < c
AmoebaSim.numberOfA
toms);
unsigned int maskD = (atomI < c
Sim.a
toms) && ((y+tj) < c
Sim.a
toms);
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -5.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = -9.0f;
...
...
@@ -362,13 +362,13 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
load3dArrayBufferPerWarp
(
offset
,
fieldSumS
,
outputFieldS
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSumS
,
outputFieldPolarS
);
offset
=
3
*
(
y
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
field
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolar
,
outputFieldPolar
);
...
...
@@ -376,13 +376,13 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolarS
,
outputFieldPolarS
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
load3dArray
(
offset
,
fieldSumS
,
outputFieldS
);
load3dArray
(
offset
,
fieldPolarSumS
,
outputFieldPolarS
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
field
,
outputField
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolar
,
outputFieldPolar
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
fieldS
,
outputFieldS
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
View file @
761d7e17
...
...
@@ -217,13 +217,14 @@ void kSorUpdateMutualInducedField_kernel(
static
void
kReduceMutualInducedFields
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
outputArray
,
CUDAStream
<
float
>*
outputPolarArray
)
{
kReduceFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
outputArray
->
_pDevData
);
LAUNCHERROR
(
"kReduceMI_Fields1"
);
kReduceFields_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
amoebaGpu
->
fieldR
educe
T
hreads
PerB
lock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
gpu
->
sim
.
bsf_r
educe
_t
hreads
_per_b
lock
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
outputPolarArray
->
_pDevData
);
LAUNCHERROR
(
"kReduceMI_Fields2"
);
}
...
...
@@ -277,14 +278,14 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
amoebaG
pu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
g
pu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
...
...
@@ -296,7 +297,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
}
else
{
kCalculateAmoebaMutualInducedFieldN2_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaMutualInducedFieldN2_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
View file @
761d7e17
...
...
@@ -105,7 +105,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
#endif
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
j
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
j
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
...
...
@@ -126,20 +126,20 @@ if( atomI == targetAtom ){
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
//debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray
[
index
].
z
=
(
float
)
c
AmoebaSim
.
numberOfA
toms
;
debugArray
[
index
].
z
=
(
float
)
c
Sim
.
a
toms
;
debugArray
[
index
].
w
=
(
float
)
(
mask
+
1
);
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
ijField
[
indexI
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
ijField
[
indexI
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
ijField
[
indexI
][
2
]
:
0
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
ijField
[
indexI
+
1
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
ijField
[
indexI
+
1
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
ijField
[
indexI
+
1
][
2
]
:
0
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
1
.
0
f
;
...
...
@@ -150,12 +150,12 @@ if( atomI == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
...
...
@@ -191,7 +191,7 @@ if( atomI == targetAtom ){
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
tj
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
tj
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
...
...
@@ -229,17 +229,17 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
debugArray
[
index
].
z
=
cAmoebaSim
.
pDampingFactorAndThole
[
atomI
].
x
;
debugArray
[
index
].
w
=
(
float
)
(
mask
+
1
);
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
ijField
[
indexI
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
ijField
[
indexI
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
ijField
[
indexI
][
2
]
:
0
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
ijField
[
indexI
+
1
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
ijField
[
indexI
+
1
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
ijField
[
indexI
+
1
][
2
]
:
0
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
-
1
.
0
f
;
...
...
@@ -253,21 +253,21 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
offset
=
3
*
(
y
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
field
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolar
,
outputFieldPolar
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
field
,
outputField
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolar
,
outputFieldPolar
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
View file @
761d7e17
...
...
@@ -3,6 +3,7 @@
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "bbsort.h"
...
...
@@ -982,7 +983,7 @@ void kRecordInducedDipoleField_kernel(float* output, float* outputPolar)
}
}
extern
void
cudaComputeAmoebaMapTorque
s
AndAddTo
tal
Force
2
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float4
>*
psOutputForce
);
extern
void
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
);
/**
* Compute the potential and forces due to the reciprocal space PME calculation for fixed multipoles.
...
...
@@ -1014,7 +1015,21 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu)
LAUNCHERROR
(
"kRecordFixedMultipoleField"
);
kComputeFixedMultipoleForceAndEnergy_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeFixedMultipoleForceAndEnergy"
);
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpu
,
amoebaGpu
->
psTorque
,
gpu
->
psForce4
);
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
0
);
VectorOfDoubleVectors
outputVector
;
kReduceForces
(
gpu
);
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psForce4
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psPosq4
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaRecipForceOnlyFixed"
,
fileId
,
outputVector
);
kClearForces
(
gpu
);
}
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
}
/**
...
...
@@ -1047,6 +1062,5 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
kComputeInducedDipoleForceAndEnergy_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeInducedDipoleForceAndEnergy"
);
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpu
,
amoebaGpu
->
psTorque
,
gpu
->
psForce4
);
LAUNCHERROR
(
"cudaComputeAmoebaMapTorquesAndAddTotalForce2_kCalculateAmoebaPMEInducedDipoleForces"
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
View file @
761d7e17
...
...
@@ -3,6 +3,7 @@
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
...
...
@@ -91,7 +92,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
float4* debugArray, float4* pullBack )
{
unsigned int index = atomI + atomJ*c
Amoeba
Sim.paddedNumberOfAtoms;
unsigned int index = atomI + atomJ*cSim.paddedNumberOfAtoms;
float blockId = 111.0f;
debugArray[index].x = (float) atomI;
...
...
@@ -100,7 +101,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
debugArray[index].w = blockId;
for( int pullIndex = 0; pullIndex < 1; pullIndex++ ){
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
...
...
@@ -1082,66 +1083,17 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
// reduce psWorkArray_3_1 -> torque
static
void
kReduce
Force
Torque
(
amoebaGpuContext
amoebaGpu
)
static
void
kReduceTorque
(
amoebaGpuContext
amoebaGpu
)
{
kReduceFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psForce
->
_pDevData
);
LAUNCHERROR
(
"kReducePmeDirectElectrostaticForce"
);
kReduceFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
amoebaGpu
->
psTorque
->
_pDevData
);
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psTorque
->
_pDevData
);
LAUNCHERROR
(
"kReducePmeDirectElectrostaticTorque"
);
}
/**---------------------------------------------------------------------------------------
Zero gpu->psForce4
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static
void
zeroForce
(
amoebaGpuContext
amoebaGpu
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
natoms
;
ii
++
){
gpu
->
psForce4
->
_pSysData
[
ii
].
x
=
0.0
f
;
gpu
->
psForce4
->
_pSysData
[
ii
].
y
=
0.0
f
;
gpu
->
psForce4
->
_pSysData
[
ii
].
z
=
0.0
f
;
}
gpu
->
psForce4
->
Upload
();
}
/**---------------------------------------------------------------------------------------
Copy gpu->psForce4 to amoebaGpu->psForce
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static
void
copyForce
(
amoebaGpuContext
amoebaGpu
,
float
conversion
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
gpu
->
psForce4
->
Download
();
int
indexOffset
=
0
;
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
natoms
;
ii
++
){
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
]
=
conversion
*
(
gpu
->
psForce4
->
_pSysData
[
ii
].
x
);
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
+
1
]
=
conversion
*
(
gpu
->
psForce4
->
_pSysData
[
ii
].
y
);
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
+
2
]
=
conversion
*
(
gpu
->
psForce4
->
_pSysData
[
ii
].
z
);
indexOffset
+=
3
;
}
amoebaGpu
->
psForce
->
Upload
();
}
//#define GET_INDUCED_DIPOLE_FROM_FILE
#ifdef GET_INDUCED_DIPOLE_FROM_FILE
#include <stdlib.h>
#endif
/**---------------------------------------------------------------------------------------
Compute Amoeba dirrect space portion of electrostatic force & torque
...
...
@@ -1153,10 +1105,6 @@ static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
void
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpuContext
amoebaGpu
)
{
// ---------------------------------------------------------------------------------------
static
unsigned
int
threadsPerBlock
=
0
;
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaPmeDirectElectrostatic"
;
static
int
timestep
=
0
;
...
...
@@ -1186,42 +1134,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
unsigned
int
targetAtom
=
49
;
#endif
#ifdef GET_INDUCED_DIPOLE_FROM_FILE
//std::string fileName = "waterInducedDipole.txt";
std
::
string
fileName
=
"water_3_MI.txt"
;
StringVectorVector
fileContents
;
readFile
(
fileName
,
fileContents
);
unsigned
int
offset
=
0
;
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Read file: %s %u
\n
"
,
fileName
.
c_str
(),
fileContents
.
size
()
);
fflush
(
amoebaGpu
->
log
);
}
for
(
unsigned
int
ii
=
1
;
ii
<
fileContents
.
size
()
-
1
;
ii
++
){
StringVector
lineTokens
=
fileContents
[
ii
];
unsigned
int
lineTokenIndex
=
1
;
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
++
]
=
static_cast
<
float
>
(
atof
(
lineTokens
[
lineTokenIndex
++
].
c_str
()));
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
++
]
=
static_cast
<
float
>
(
atof
(
lineTokens
[
lineTokenIndex
++
].
c_str
()));
amoebaGpu
->
psInducedDipole
->
_pSysData
[
offset
++
]
=
static_cast
<
float
>
(
atof
(
lineTokens
[
lineTokenIndex
++
].
c_str
()));
offset
-=
3
;
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
++
]
=
static_cast
<
float
>
(
atof
(
lineTokens
[
lineTokenIndex
++
].
c_str
()));
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
++
]
=
static_cast
<
float
>
(
atof
(
lineTokens
[
lineTokenIndex
++
].
c_str
()));
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
offset
++
]
=
static_cast
<
float
>
(
atof
(
lineTokens
[
lineTokenIndex
++
].
c_str
()));
}
float
conversion
=
0.1
f
;
for
(
int
ii
=
0
;
ii
<
3
*
gpu
->
natoms
;
ii
++
){
amoebaGpu
->
psInducedDipole
->
_pSysData
[
ii
]
*=
conversion
;
amoebaGpu
->
psInducedDipolePolar
->
_pSysData
[
ii
]
*=
conversion
;
}
amoebaGpu
->
gpuContext
->
sim
.
alphaEwald
=
5.4459052e+00
f
;
SetCalculateAmoebaPmeDirectElectrostaticSim
(
amoebaGpu
);
amoebaGpu
->
psInducedDipole
->
Upload
();
amoebaGpu
->
psInducedDipolePolar
->
Upload
();
#endif
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
...
...
@@ -1233,154 +1148,44 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)),
maxThreads
);
}
kClearFields_3
(
amoebaGpu
,
2
);
kClearFields_3
(
amoebaGpu
,
1
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)),
sizeof
(
PmeDirectElectrostaticParticle
)
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
),
(
sizeof
(
PmeDirectElectrostaticParticle
))
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
nonbond_threads_per_block
);
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
),
(
sizeof
(
PmeDirectElectrostaticParticle
))
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
nonbond_threads_per_block
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_
2
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_
1
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_
2
->
_pDevData
);
amoebaGpu
->
psWorkArray_3_
1
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_
2
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_
1
->
_pDevData
,
debugArray
->
_pDevData
,
targetAtom
);
#else
amoebaGpu
->
psWorkArray_3_
2
->
_pDevData
);
amoebaGpu
->
psWorkArray_3_
1
->
_pDevData
);
#endif
}
LAUNCHERROR
(
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces"
);
kReduceForceTorque
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
amoebaGpu
->
psForce
->
Download
();
amoebaGpu
->
psTorque
->
Download
();
debugArray
->
Download
();
int
maxPrint
=
5
;
float
conversion
=
1.0
f
/
41.84
f
;
float
forceSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished PmeDirectElectrostatic kernel execution conversion=%.5f
\n
"
,
conversion
);
(
void
)
fflush
(
amoebaGpu
->
log
);
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// force
(
void
)
fprintf
(
amoebaGpu
->
log
,
"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] "
,
conversion
*
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
],
conversion
*
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
+
1
],
conversion
*
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
+
2
]
);
forceSum
[
0
]
+=
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
];
forceSum
[
1
]
+=
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
+
1
];
forceSum
[
2
]
+=
amoebaGpu
->
psForce
->
_pSysData
[
indexOffset
+
2
];
// torque
(
void
)
fprintf
(
amoebaGpu
->
log
,
"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] "
,
conversion
*
amoebaGpu
->
psTorque
->
_pSysData
[
indexOffset
],
conversion
*
amoebaGpu
->
psTorque
->
_pSysData
[
indexOffset
+
1
],
conversion
*
amoebaGpu
->
psTorque
->
_pSysData
[
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
if
(
0
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"DebugElec
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
float
torqueSum0
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
float
torqueSum1
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
int
offset0
=
7
*
paddedNumberOfAtoms
;
int
offset1
=
8
*
paddedNumberOfAtoms
;
int
offset2
=
7
*
paddedNumberOfAtoms
;
int
offset3
=
8
*
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
if
(
fabs
(
debugArray
->
_pSysData
[
debugIndex
+
5
*
paddedNumberOfAtoms
].
y
)
<
1.0e-10
)
continue
;
if
(
jj
!=
targetAtom
){
torqueSum0
[
0
]
+=
debugArray
->
_pSysData
[
debugIndex
+
offset0
].
x
+
debugArray
->
_pSysData
[
debugIndex
+
offset1
].
x
;
torqueSum0
[
1
]
+=
debugArray
->
_pSysData
[
debugIndex
+
offset0
].
y
+
debugArray
->
_pSysData
[
debugIndex
+
offset1
].
y
;
torqueSum0
[
2
]
+=
debugArray
->
_pSysData
[
debugIndex
+
offset0
].
z
+
debugArray
->
_pSysData
[
debugIndex
+
offset1
].
z
;
torqueSum1
[
0
]
+=
debugArray
->
_pSysData
[
debugIndex
+
offset2
].
x
+
debugArray
->
_pSysData
[
debugIndex
+
offset3
].
x
;
torqueSum1
[
1
]
+=
debugArray
->
_pSysData
[
debugIndex
+
offset2
].
y
+
debugArray
->
_pSysData
[
debugIndex
+
offset3
].
y
;
torqueSum1
[
2
]
+=
debugArray
->
_pSysData
[
debugIndex
+
offset2
].
z
+
debugArray
->
_pSysData
[
debugIndex
+
offset3
].
z
;
}
if
(
jj
==
2
){
offset0
+=
2
*
paddedNumberOfAtoms
;
offset1
+=
2
*
paddedNumberOfAtoms
;
}
for
(
int
kk
=
0
;
kk
<
12
;
kk
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d %5d %5d [%16.9e %16.9e %16.9e %16.9e] E11
\n
"
,
targetAtom
,
jj
,
kk
,
debugArray
->
_pSysData
[
debugIndex
].
x
,
debugArray
->
_pSysData
[
debugIndex
].
y
,
debugArray
->
_pSysData
[
debugIndex
].
z
,
debugArray
->
_pSysData
[
debugIndex
].
w
);
debugIndex
+=
paddedNumberOfAtoms
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d %5d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] Sum
\n
"
,
targetAtom
,
jj
,
torqueSum0
[
0
],
torqueSum0
[
1
],
torqueSum0
[
2
],
torqueSum1
[
0
],
torqueSum1
[
1
],
torqueSum1
[
2
]);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
if
(
1
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psPosq4
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psTorque
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeDirectForceTorque"
,
fileId
,
outputVector
);
}
}
delete
debugArray
;
#endif
cudaComputeAmoebaMapTorquesAndAddTotalForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
,
amoebaGpu
->
psForce
,
gpu
->
psForce4
);
if
(
0
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
copyForce
(
amoebaGpu
,
-
1.0
f
/
41.84
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeDirectForce"
,
fileId
,
outputVector
);
}
kReduceTorque
(
amoebaGpu
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
}
...
...
@@ -1391,100 +1196,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
extern
double
kReduceEnergy
(
gpuContext
gpu
);
void
cudaComputeAmoebaPmeElectrostatic
(
amoebaGpuContext
amoebaGpu
)
{
#ifdef AMOEBA_DEBUG
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
float
conversion
=
-
1.0
f
/
41.84
;
copyForce
(
amoebaGpu
,
conversion
);
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeRecipDemForce"
,
fileId
,
outputVector
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
float
>*
debugArray
=
new
CUDAStream
<
float
>
(
paddedNumberOfAtoms
*
3
,
1
,
"FArray"
);
int
index
=
0
;
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
natoms
;
ii
++
){
debugArray
->
_pSysData
[
index
]
=
amoebaGpu
->
psForce
->
_pSysData
[
index
];
debugArray
->
_pSysData
[
index
+
1
]
=
amoebaGpu
->
psForce
->
_pSysData
[
index
+
1
];
debugArray
->
_pSysData
[
index
+
2
]
=
amoebaGpu
->
psForce
->
_pSysData
[
index
+
2
];
index
+=
3
;
}
//zeroForce( amoebaGpu );
double
dem
=
kReduceEnergy
(
gpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
double
dep
=
kReduceEnergy
(
gpu
);
fprintf
(
stderr
,
"Recip Em=%15.7e ep=%15.7e ttl=%15.7e"
,
dem
/
4.184
,
(
dep
-
dem
)
/
4.184
,
dep
/
4.184
);
copyForce
(
amoebaGpu
,
conversion
);
VectorOfDoubleVectors
outputVector1
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector1
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeRecipForce"
,
fileId
,
outputVector1
);
VectorOfDoubleVectors
outputVector2
;
index
=
0
;
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
gpuContext
->
natoms
;
ii
++
){
amoebaGpu
->
psForce
->
_pSysData
[
index
]
-=
debugArray
->
_pSysData
[
index
];
amoebaGpu
->
psForce
->
_pSysData
[
index
+
1
]
-=
debugArray
->
_pSysData
[
index
+
1
];
amoebaGpu
->
psForce
->
_pSysData
[
index
+
2
]
-=
debugArray
->
_pSysData
[
index
+
2
];
index
+=
3
;
}
amoebaGpu
->
psForce
->
Upload
();
outputVector
.
resize
(
0
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector2
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeRecipDepForce"
,
fileId
,
outputVector2
);
zeroForce
(
amoebaGpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
//zeroForce( amoebaGpu );
exit
(
0
);
}
#endif
#ifdef AMOEBA_DEBUG
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
zeroForce
(
amoebaGpu
);
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpu
);
copyForce
(
amoebaGpu
,
-
1.0
f
/
41.84
f
);
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"yCudaPmeDirectForce"
,
fileId
,
outputVector
);
zeroForce
(
amoebaGpu
);
}
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
zeroForce
(
amoebaGpu
);
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
copyForce
(
amoebaGpu
,
-
1.0
f
/
41.84
f
);
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeForce"
,
fileId
,
outputVector
);
}
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
copyForce
(
amoebaGpu
,
-
1.0
f
/
41.84
f
);
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPrePmeForce"
,
fileId
,
outputVector
);
}
#endif
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
View file @
761d7e17
...
...
@@ -35,13 +35,12 @@ __launch_bounds__(128, 1)
__launch_bounds__
(
64
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaPmeDirectElectrostatic
,
Forces_kernel
)(
unsigned
int
*
workUnit
,
float
*
outputForce
,
float
*
outputTorque
unsigned
int
*
workUnit
,
float
*
outputTorque
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
unsigned
int
targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
int
maxPullIndex
=
7
;
float4
pullBack
[
12
];
...
...
@@ -103,7 +102,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
if
(
bExclusionFlag
)
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
unsigned
int
cell
=
xi
+
xi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
...
...
@@ -136,7 +135,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
if
(
(
atomI
!=
atomJ
)
&&
(
atomI
<
c
AmoebaSim
.
numberOfAtoms
)
&&
(
atomJ
<
cAmoebaSim
.
numberOfA
toms
)
)
if
(
(
atomI
!=
atomJ
)
&&
(
atomI
<
c
Sim
.
atoms
)
&&
(
atomJ
<
cSim
.
a
toms
)
)
{
localParticle
.
force
[
0
]
+=
forceTorqueEnergy
[
0
].
x
;
localParticle
.
force
[
1
]
+=
forceTorqueEnergy
[
0
].
y
;
...
...
@@ -154,7 +153,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
atomJ
==
targetAtom
){
unsigned
int
mask
=
(
(
atomI
==
atomJ
)
||
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
(
atomJ
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
==
atomJ
)
||
(
atomI
>=
c
Sim
.
a
toms
)
||
(
atomJ
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
atomJ
:
atomI
;
float
blockId
=
1
.
0
f
;
...
...
@@ -163,28 +162,28 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray
[
index
].
z
=
(
float
)
y
;
debugArray
[
index
].
w
=
blockId
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceTorqueEnergy
[
0
].
x
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceTorqueEnergy
[
0
].
y
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceTorqueEnergy
[
0
].
z
:
0
.
0
f
;
debugArray
[
index
].
w
=
mask
?
forceTorqueEnergy
[
0
].
w
:
0
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceTorqueEnergy
[
1
].
x
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceTorqueEnergy
[
1
].
y
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceTorqueEnergy
[
1
].
z
:
0
.
0
f
;
float
offsetF
=
(
float
)(
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
));
float
offsetF
=
(
float
)(
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
));
debugArray
[
index
].
w
=
offsetF
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceTorqueEnergy
[
2
].
x
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceTorqueEnergy
[
2
].
y
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceTorqueEnergy
[
2
].
z
:
0
.
0
f
;
debugArray
[
index
].
w
=
offsetF
;
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
...
...
@@ -197,7 +196,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// include self energy and self torque
if
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
){
if
(
atomI
<
c
Sim
.
a
toms
){
calculatePmeSelfTorqueElectrostaticPairIxn_kernel
(
localParticle
);
float
energy
;
calculatePmeSelfEnergyElectrostaticPairIxn_kernel
(
localParticle
,
&
energy
);
...
...
@@ -207,41 +206,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float
of
;
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
of
=
outputForce
[
offset
];
of
+=
localParticle
.
force
[
0
];
outputForce
[
offset
]
=
of
;
of
=
outputForce
[
offset
+
1
];
of
+=
localParticle
.
force
[
1
];
outputForce
[
offset
+
1
]
=
of
;
of
=
outputForce
[
offset
+
2
];
of
+=
localParticle
.
force
[
2
];
outputForce
[
offset
+
2
]
=
of
;
of
=
outputTorque
[
offset
];
of
+=
localParticle
.
torque
[
0
];
outputTorque
[
offset
]
=
of
;
of
=
outputTorque
[
offset
+
1
];
of
+=
localParticle
.
torque
[
1
];
outputTorque
[
offset
+
1
]
=
of
;
of
=
outputTorque
[
offset
+
2
];
of
+=
localParticle
.
torque
[
2
];
outputTorque
[
offset
+
2
]
=
of
;
unsigned
int
offset
=
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
add3dArrayToFloat4
(
offset
,
localParticle
.
force
,
cSim
.
pForce4
);
add3dArray
(
3
*
offset
,
localParticle
.
torque
,
outputTorque
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
outputForce
[
offset
]
=
localParticle
.
force
[
0
];
outputForce
[
offset
+
1
]
=
localParticle
.
force
[
1
];
outputForce
[
offset
+
2
]
=
localParticle
.
force
[
2
];
outputTorque
[
offset
]
=
localParticle
.
torque
[
0
];
outputTorque
[
offset
+
1
]
=
localParticle
.
torque
[
1
];
outputTorque
[
offset
+
2
]
=
localParticle
.
torque
[
2
];
unsigned
int
offset
=
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
add3dArrayToFloat4
(
offset
,
localParticle
.
force
,
cSim
.
pForce4
);
load3dArray
(
3
*
offset
,
localParticle
.
torque
,
outputTorque
);
#endif
}
else
{
...
...
@@ -271,7 +242,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
unsigned
int
cell
=
xi
+
yi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
...
...
@@ -306,7 +277,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// check if atoms out-of-bounds
if
(
(
atomI
<
c
AmoebaSim
.
numberOfAtoms
)
&&
(
atomJ
<
cAmoebaSim
.
numberOfA
toms
)
)
if
(
(
atomI
<
c
Sim
.
atoms
)
&&
(
atomJ
<
cSim
.
a
toms
)
)
{
// add force and torque to atom I due atom J
...
...
@@ -372,7 +343,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef AMOEBA_DEBUG
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
unsigned
int
atomJ
=
y
+
jIdx
;
unsigned
int
mask
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
(
atomJ
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
(
atomJ
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
if
(
atomI
==
targetAtom
||
atomJ
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
atomJ
:
atomI
;
...
...
@@ -381,29 +352,29 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray
[
index
].
z
=
(
float
)
y
;
debugArray
[
index
].
w
=
(
flags
==
0xFFFFFFFF
)
?
(
float
)
-
141
.
0
f
:
-
151
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceTorqueEnergy
[
0
].
x
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceTorqueEnergy
[
0
].
y
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceTorqueEnergy
[
0
].
z
:
0
.
0
f
;
debugArray
[
index
].
w
=
mask
?
forceTorqueEnergy
[
0
].
w
:
0
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceTorqueEnergy
[
1
].
x
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceTorqueEnergy
[
1
].
y
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceTorqueEnergy
[
1
].
z
:
0
.
0
f
;
float
offsetF
=
(
float
)(
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
));
float
offsetF
=
(
float
)(
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
));
debugArray
[
index
].
w
=
offsetF
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceTorqueEnergy
[
2
].
x
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceTorqueEnergy
[
2
].
y
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceTorqueEnergy
[
2
].
z
:
0
.
0
f
;
offsetF
=
(
float
)
(
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
));
offsetF
=
(
float
)
(
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
));
debugArray
[
index
].
w
=
offsetF
;
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
...
...
@@ -419,78 +390,23 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float
of
;
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
of
=
outputForce
[
offset
];
of
+=
localParticle
.
force
[
0
];
outputForce
[
offset
]
=
of
;
of
=
outputForce
[
offset
+
1
];
of
+=
localParticle
.
force
[
1
];
outputForce
[
offset
+
1
]
=
of
;
of
=
outputForce
[
offset
+
2
];
of
+=
localParticle
.
force
[
2
];
outputForce
[
offset
+
2
]
=
of
;
of
=
outputTorque
[
offset
];
of
+=
localParticle
.
torque
[
0
];
outputTorque
[
offset
]
=
of
;
of
=
outputTorque
[
offset
+
1
];
of
+=
localParticle
.
torque
[
1
];
outputTorque
[
offset
+
1
]
=
of
;
unsigned
int
offset
=
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
add3dArrayToFloat4
(
offset
,
localParticle
.
force
,
cSim
.
pForce4
);
add3dArray
(
3
*
offset
,
localParticle
.
torque
,
outputTorque
);
of
=
outputTorque
[
offset
+
2
];
of
+=
localParticle
.
torque
[
2
];
outputTorque
[
offset
+
2
]
=
of
;
offset
=
3
*
(
y
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
of
=
outputForce
[
offset
];
of
+=
sA
[
threadIdx
.
x
].
force
[
0
];
outputForce
[
offset
]
=
of
;
of
=
outputForce
[
offset
+
1
];
of
+=
sA
[
threadIdx
.
x
].
force
[
1
];
outputForce
[
offset
+
1
]
=
of
;
of
=
outputForce
[
offset
+
2
];
of
+=
sA
[
threadIdx
.
x
].
force
[
2
];
outputForce
[
offset
+
2
]
=
of
;
of
=
outputTorque
[
offset
];
of
+=
sA
[
threadIdx
.
x
].
torque
[
0
];
outputTorque
[
offset
]
=
of
;
of
=
outputTorque
[
offset
+
1
];
of
+=
sA
[
threadIdx
.
x
].
torque
[
1
];
outputTorque
[
offset
+
1
]
=
of
;
of
=
outputTorque
[
offset
+
2
];
of
+=
sA
[
threadIdx
.
x
].
torque
[
2
];
outputTorque
[
offset
+
2
]
=
of
;
offset
=
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
add3dArrayToFloat4
(
offset
,
sA
[
threadIdx
.
x
].
force
,
cSim
.
pForce4
);
add3dArray
(
3
*
offset
,
sA
[
threadIdx
.
x
].
torque
,
outputTorque
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
outputForce
[
offset
]
=
localParticle
.
force
[
0
];
outputForce
[
offset
+
1
]
=
localParticle
.
force
[
1
];
outputForce
[
offset
+
2
]
=
localParticle
.
force
[
2
];
outputTorque
[
offset
]
=
localParticle
.
torque
[
0
];
outputTorque
[
offset
+
1
]
=
localParticle
.
torque
[
1
];
outputTorque
[
offset
+
2
]
=
localParticle
.
torque
[
2
];
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
add3dArrayToFloat4
(
offset
,
localParticle
.
force
,
cSim
.
pForce4
);
load3dArray
(
3
*
offset
,
localParticle
.
torque
,
outputTorque
);
outputForce
[
offset
]
=
sA
[
threadIdx
.
x
].
force
[
0
];
outputForce
[
offset
+
1
]
=
sA
[
threadIdx
.
x
].
force
[
1
];
outputForce
[
offset
+
2
]
=
sA
[
threadIdx
.
x
].
force
[
2
];
offset
=
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
outputTorque
[
offset
]
=
sA
[
threadIdx
.
x
].
torque
[
0
];
outputTorque
[
offset
+
1
]
=
sA
[
threadIdx
.
x
].
torque
[
1
];
outputTorque
[
offset
+
2
]
=
sA
[
threadIdx
.
x
].
torque
[
2
];
add3dArrayToFloat4
(
offset
,
sA
[
threadIdx
.
x
].
force
,
cSim
.
pForce4
);
load3dArray
(
3
*
offset
,
sA
[
threadIdx
.
x
].
torque
,
outputTorque
);
#endif
lasty
=
y
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
View file @
761d7e17
...
...
@@ -3,6 +3,7 @@
//-----------------------------------------------------------------------------------------
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
...
...
@@ -135,17 +136,19 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
static
void
kReducePmeDirectE_Fields
(
amoebaGpuContext
amoebaGpu
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
// E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self
kReducePmeEFieldPolar_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
amoebaGpu
->
fieldR
educe
T
hreads
PerB
lock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
kReducePmeEFieldPolar_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
gpu
->
sim
.
bsf_r
educe
_t
hreads
_per_b
lock
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
);
LAUNCHERROR
(
"kReducePmeE_Fields1"
);
// E_Field = E_Field (reciprocal) + E_Field (direct) + self
kReducePmeEField_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
amoebaGpu
->
fieldR
educe
T
hreads
PerB
lock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
kReducePmeEField_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
gpu
->
sim
.
bsf_r
educe
_t
hreads
_per_b
lock
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
);
LAUNCHERROR
(
"kReducePmeE_Fields2"
);
}
...
...
@@ -362,7 +365,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
static
int
isNanOrInfinity
(
double
number
){
return
(
number
!=
number
||
number
==
std
::
numeric_limits
<
double
>::
infinity
()
||
number
==
-
std
::
numeric_limits
<
double
>::
infinity
())
?
1
:
0
;
}
#endif
static
void
bubbleSort
(
std
::
vector
<
int
>&
array
,
std
::
vector
<
int
>&
track
,
int
length
)
{
...
...
@@ -390,7 +392,8 @@ static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int le
if
(
test
==
0
)
break
;
/*will exit if the list is sorted!*/
}
/*end for i*/
}
/*end bubbleSort*/
}
#endif
/**---------------------------------------------------------------------------------------
...
...
@@ -438,7 +441,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
...
...
@@ -448,7 +451,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
#endif
}
else
{
kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
...
...
@@ -469,15 +472,15 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d
\n
"
,
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
amoebaG
pu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
g
pu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
bOutputBufferPerWarp
);
(
void
)
fflush
(
amoebaGpu
->
log
);
/*
(void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n",
amoebaGpu->
paddedNumberOfAtoms,
amoebaGpu->
outputBuffers );
(void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n",
gpu->sim.
paddedNumberOfAtoms,
gpu->sim.
outputBuffers );
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
for( int ii = 0; ii <
amoebaGpu->
paddedNumberOfAtoms; ii++ ){
for( int ii = 0; ii <
gpu->sim.
paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
...
...
@@ -624,33 +627,13 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
fileId
.
push_back
(
0
);
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
kReduceForces
(
gpu
);
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psForce4
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaRecipEField"
,
fileId
,
outputVector
);
//exit(0);
}
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
0
);
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEField"
,
fileId
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaRecipForceTorqueFixed"
,
fileId
,
outputVector
);
//cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
exit
(
0
);
}
if
(
0
){
cudaComputeAmoebaPmeDirectFixedEField
(
amoebaGpu
);
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaDirectEField"
,
fileId
,
outputVector
);
}
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
View file @
761d7e17
...
...
@@ -101,7 +101,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
if
(
bExclusionFlag
){
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
unsigned
int
cell
=
xi
+
xi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
...
...
@@ -127,7 +127,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
unsigned
int
match
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
j
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
1
:
0
;
unsigned
int
match
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
j
)
>=
c
Sim
.
a
toms
)
)
?
1
:
0
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -151,37 +151,37 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
debugArray
[
index
].
z
=
dScaleValue
;
debugArray
[
index
].
w
=
pScaleValue
;
/*
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
*/
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
ijField
[
0
].
x
;
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
ijField
[
1
].
x
;
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
ijField
[
2
].
x
;
debugArray
[
index
].
w
=
flag
+
1
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
ijField
[
0
].
z
;
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
ijField
[
1
].
z
;
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
ijField
[
2
].
z
;
debugArray
[
index
].
w
=
flag
+
2
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
ijField
[
0
].
y
;
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
ijField
[
1
].
y
;
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
ijField
[
2
].
y
;
debugArray
[
index
].
w
=
flag
+
3
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
match
?
0
.
0
f
:
ijField
[
0
].
w
;
debugArray
[
index
].
y
=
match
?
0
.
0
f
:
ijField
[
1
].
w
;
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
ijField
[
2
].
w
;
...
...
@@ -189,7 +189,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
...
...
@@ -204,11 +204,11 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputEField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputEFieldPolar
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputEField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputEFieldPolar
);
#endif
...
...
@@ -235,7 +235,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
if
(
bExclusionFlag
)
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
unsigned
int
cell
=
xi
+
yi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
...
...
@@ -258,7 +258,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
#endif
);
unsigned
int
outOfBounds
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
jIdx
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
1
:
0
;
unsigned
int
outOfBounds
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
jIdx
)
>=
c
Sim
.
a
toms
)
)
?
1
:
0
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -331,44 +331,44 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
float
flag
=
9
.
0
f
;
/*
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = jIdx;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
*/
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
0
].
x
;
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
].
x
;
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
2
].
x
;
debugArray
[
index
].
w
=
flag
+
1
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
0
].
y
;
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
].
y
;
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
2
].
y
;
debugArray
[
index
].
w
=
flag
+
2
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
0
].
z
;
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
].
z
;
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
2
].
z
;
debugArray
[
index
].
w
=
flag
+
3
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
0
].
w
;
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
].
w
;
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
2
].
w
;
debugArray
[
index
].
w
=
flag
+
4
.
0
f
;
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
...
...
@@ -384,20 +384,20 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputEField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputEFieldPolar
);
offset
=
3
*
(
y
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
eField
,
outputEField
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
eFieldP
,
outputEFieldPolar
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputEField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputEFieldPolar
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
eField
,
outputEField
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
eFieldP
,
outputEFieldPolar
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
View file @
761d7e17
...
...
@@ -354,13 +354,14 @@ static void kSorUpdateMutualInducedField_kernel(
static
void
kReduceMutualInducedFields
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
outputArray
,
CUDAStream
<
float
>*
outputPolarArray
)
{
kReduceFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
outputArray
->
_pDevData
);
LAUNCHERROR
(
"kReducePmeMI_Fields1"
);
kReduceFields_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
amoebaGpu
->
fieldR
educe
T
hreads
PerB
lock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
gpu
->
sim
.
bsf_r
educe
_t
hreads
_per_b
lock
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
,
outputPolarArray
->
_pDevData
);
LAUNCHERROR
(
"kReducePmeMI_Fields2"
);
}
...
...
@@ -410,21 +411,22 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Cutoff -- use warp
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
amoebaG
pu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Cutoff -- use warp
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
g
pu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
//gpu->sim.pInteractingWorkUnit,
//amoebaGpu->psWorkUnit->_pDevData,
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
...
...
@@ -436,17 +438,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
}
else
{
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Cutoff no warp
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
#ifdef AMOEBA_DEBUG
...
...
@@ -596,7 +588,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
int offset = 0;
int maxPrint = 10;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii,
amoebaGpu->psPolarizability->_pSysData[offset] );
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
View file @
761d7e17
...
...
@@ -110,7 +110,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
#endif
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
j
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
j
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
...
...
@@ -136,51 +136,51 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].w = 6.0f;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
...
...
@@ -199,12 +199,12 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
...
...
@@ -245,7 +245,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
jIdx
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
jIdx
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
...
...
@@ -323,39 +323,39 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray[index].w = 7.0f;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
...
...
@@ -372,21 +372,21 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
fieldSum
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
offset
=
3
*
(
y
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
field
,
outputField
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolar
,
outputFieldPolar
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
fieldSum
,
outputField
);
load3dArray
(
offset
,
fieldPolarSum
,
outputFieldPolar
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
field
,
outputField
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
fieldPolar
,
outputFieldPolar
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
View file @
761d7e17
...
...
@@ -77,74 +77,58 @@ void kCudaComputeCheckChiral_kernel( void )
const
int
C
=
3
;
float
delta
[
4
][
3
];
float4
*
atomCoord
=
cSim
.
pPosq
;
int4
*
multiPoleAtoms
=
cAmoebaSim
.
pMultipoleParticlesIdsAndAxisType
;
float
*
molecularDipole
=
cAmoebaSim
.
pMolecularDipole
;
float
*
molecularQuadrupole
=
cAmoebaSim
.
pMolecularQuadrupole
;
float4
*
particleCoord
=
cSim
.
pPosq
;
int4
*
multiPoleParticles
=
cAmoebaSim
.
pMultipoleParticlesIdsAndAxisType
;
float
*
labFrameDipole
=
cAmoebaSim
.
pLabFrameDipole
;
float
*
labFrameQuadrupole
=
cAmoebaSim
.
pLabFrameQuadrupole
;
// ---------------------------------------------------------------------------------------
int
atomIndex
=
blockIdx
.
x
;
if
(
atomIndex
>=
cSim
.
atoms
)
return
;
int
axisType
=
multiPoleAtoms
[
atomIndex
].
w
;
float
*
molDipole
=
&
(
molecularDipole
[
atomIndex
*
3
]);
float
*
labDipole
=
&
(
labFrameDipole
[
atomIndex
*
3
]);
labDipole
[
0
]
=
molDipole
[
0
];
labDipole
[
1
]
=
molDipole
[
1
];
labDipole
[
2
]
=
molDipole
[
2
];
float
*
molQuadrupole
=
&
(
molecularQuadrupole
[
atomIndex
*
9
]);
float
*
labQuadrupole
=
&
(
labFrameQuadrupole
[
atomIndex
*
9
]);
labQuadrupole
[
0
]
=
molQuadrupole
[
0
];
labQuadrupole
[
1
]
=
molQuadrupole
[
1
];
labQuadrupole
[
2
]
=
molQuadrupole
[
2
];
labQuadrupole
[
3
]
=
molQuadrupole
[
3
];
labQuadrupole
[
4
]
=
molQuadrupole
[
4
];
labQuadrupole
[
5
]
=
molQuadrupole
[
5
];
labQuadrupole
[
6
]
=
molQuadrupole
[
6
];
labQuadrupole
[
7
]
=
molQuadrupole
[
7
];
labQuadrupole
[
8
]
=
molQuadrupole
[
8
];
// skip z-then-x
if
(
axisType
==
0
||
multiPoleAtoms
[
atomIndex
].
y
<
0
)
return
;
// ---------------------------------------------------------------------------------------
int
atomA
=
atomIndex
;
int
atomB
=
multiPoleAtoms
[
atomIndex
].
z
;
int
atomC
=
multiPoleAtoms
[
atomIndex
].
x
;
int
atomD
=
multiPoleAtoms
[
atomIndex
].
y
;
delta
[
AD
][
0
]
=
atomCoord
[
atomA
].
x
-
atomCoord
[
atomD
].
x
;
delta
[
AD
][
1
]
=
atomCoord
[
atomA
].
y
-
atomCoord
[
atomD
].
y
;
delta
[
AD
][
2
]
=
atomCoord
[
atomA
].
z
-
atomCoord
[
atomD
].
z
;
delta
[
BD
][
0
]
=
atomCoord
[
atomB
].
x
-
atomCoord
[
atomD
].
x
;
delta
[
BD
][
1
]
=
atomCoord
[
atomB
].
y
-
atomCoord
[
atomD
].
y
;
delta
[
BD
][
2
]
=
atomCoord
[
atomB
].
z
-
atomCoord
[
atomD
].
z
;
delta
[
CD
][
0
]
=
atomCoord
[
atomC
].
x
-
atomCoord
[
atomD
].
x
;
delta
[
CD
][
1
]
=
atomCoord
[
atomC
].
y
-
atomCoord
[
atomD
].
y
;
delta
[
CD
][
2
]
=
atomCoord
[
atomC
].
z
-
atomCoord
[
atomD
].
z
;
delta
[
C
][
0
]
=
delta
[
BD
][
1
]
*
delta
[
CD
][
2
]
-
delta
[
BD
][
2
]
*
delta
[
CD
][
1
];
delta
[
C
][
1
]
=
delta
[
CD
][
1
]
*
delta
[
AD
][
2
]
-
delta
[
CD
][
2
]
*
delta
[
AD
][
1
];
delta
[
C
][
2
]
=
delta
[
AD
][
1
]
*
delta
[
BD
][
2
]
-
delta
[
AD
][
2
]
*
delta
[
BD
][
1
];
float
volume
=
delta
[
C
][
0
]
*
delta
[
AD
][
0
]
+
delta
[
C
][
1
]
*
delta
[
BD
][
0
]
+
delta
[
C
][
2
]
*
delta
[
CD
][
0
];
if
(
volume
<
0.0
){
labDipole
[
1
]
*=
-
1.0
f
;
// pole(3,i)
labQuadrupole
[
1
]
*=
-
1.0
f
;
// pole(6,i) && pole(8,i)
labQuadrupole
[
3
]
*=
-
1.0
f
;
// pole(10,i) && pole(12,i)
labQuadrupole
[
5
]
*=
-
1.0
f
;
// pole(6,i) && pole(8,i)
labQuadrupole
[
7
]
*=
-
1.0
f
;
// pole(10,i) && pole(12,i)
}
int
particleIndex
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
int
numberOfParticles
=
cSim
.
atoms
;
while
(
particleIndex
<
numberOfParticles
)
{
// skip z-then-x
int
axisType
=
multiPoleParticles
[
particleIndex
].
w
;
if
(
axisType
!=
0
&&
multiPoleParticles
[
particleIndex
].
y
>=
0
)
{
// ---------------------------------------------------------------------------------------
int
particleA
=
particleIndex
;
int
particleB
=
multiPoleParticles
[
particleIndex
].
z
;
int
particleC
=
multiPoleParticles
[
particleIndex
].
x
;
int
particleD
=
multiPoleParticles
[
particleIndex
].
y
;
delta
[
AD
][
0
]
=
particleCoord
[
particleA
].
x
-
particleCoord
[
particleD
].
x
;
delta
[
AD
][
1
]
=
particleCoord
[
particleA
].
y
-
particleCoord
[
particleD
].
y
;
delta
[
AD
][
2
]
=
particleCoord
[
particleA
].
z
-
particleCoord
[
particleD
].
z
;
delta
[
BD
][
0
]
=
particleCoord
[
particleB
].
x
-
particleCoord
[
particleD
].
x
;
delta
[
BD
][
1
]
=
particleCoord
[
particleB
].
y
-
particleCoord
[
particleD
].
y
;
delta
[
BD
][
2
]
=
particleCoord
[
particleB
].
z
-
particleCoord
[
particleD
].
z
;
delta
[
CD
][
0
]
=
particleCoord
[
particleC
].
x
-
particleCoord
[
particleD
].
x
;
delta
[
CD
][
1
]
=
particleCoord
[
particleC
].
y
-
particleCoord
[
particleD
].
y
;
delta
[
CD
][
2
]
=
particleCoord
[
particleC
].
z
-
particleCoord
[
particleD
].
z
;
delta
[
C
][
0
]
=
delta
[
BD
][
1
]
*
delta
[
CD
][
2
]
-
delta
[
BD
][
2
]
*
delta
[
CD
][
1
];
delta
[
C
][
1
]
=
delta
[
CD
][
1
]
*
delta
[
AD
][
2
]
-
delta
[
CD
][
2
]
*
delta
[
AD
][
1
];
delta
[
C
][
2
]
=
delta
[
AD
][
1
]
*
delta
[
BD
][
2
]
-
delta
[
AD
][
2
]
*
delta
[
BD
][
1
];
float
volume
=
delta
[
C
][
0
]
*
delta
[
AD
][
0
]
+
delta
[
C
][
1
]
*
delta
[
BD
][
0
]
+
delta
[
C
][
2
]
*
delta
[
CD
][
0
];
if
(
volume
<
0.0
){
labFrameDipole
[
particleIndex
*
3
+
1
]
*=
-
1.0
f
;
// pole(3,i)
labFrameQuadrupole
[
particleIndex
*
9
+
1
]
*=
-
1.0
f
;
// pole(6,i) && pole(8,i)
labFrameQuadrupole
[
particleIndex
*
9
+
3
]
*=
-
1.0
f
;
// pole(10,i) && pole(12,i)
labFrameQuadrupole
[
particleIndex
*
9
+
5
]
*=
-
1.0
f
;
// pole(6,i) && pole(8,i)
labFrameQuadrupole
[
particleIndex
*
9
+
7
]
*=
-
1.0
f
;
// pole(10,i) && pole(12,i)
}
}
particleIndex
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
__global__
...
...
@@ -162,18 +146,14 @@ void kCudaComputeLabFrameMoments_kernel( void )
float
vectorY
[
3
];
float
vectorZ
[
3
];
int
numOfAtoms
=
cSim
.
atoms
;
float4
*
atomCoord
=
cSim
.
pPosq
;
int4
*
multiPoleAtoms
=
cAmoebaSim
.
pMultipoleParticlesIdsAndAxisType
;
int
particleIndex
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
int
numberOfParticles
=
cSim
.
atoms
;
float4
*
particleCoord
=
cSim
.
pPosq
;
int4
*
multiPoleParticles
=
cAmoebaSim
.
pMultipoleParticlesIdsAndAxisType
;
float
*
labFrameDipole
=
cAmoebaSim
.
pLabFrameDipole
;
float
*
labFrameQuadrupole
=
cAmoebaSim
.
pLabFrameQuadrupole
;
// ---------------------------------------------------------------------------------------
int
atomIndex
=
blockIdx
.
x
;
// ---------------------------------------------------------------------------------------
// get coordinates of this atom and the z & x axis atoms
// compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
// this atom and the axis atom
...
...
@@ -182,216 +162,210 @@ void kCudaComputeLabFrameMoments_kernel( void )
// code common to ZThenX and Bisector
float4
coordinatesThisAtom
=
atomCoord
[
atomIndex
];
int
multipoleAtomIndex
=
multiPoleAtoms
[
atomIndex
].
z
;
float4
coordinatesAxisAtom
=
atomCoord
[
multipoleAtomIndex
];
vectorZ
[
0
]
=
coordinatesAxisAtom
.
x
-
coordinatesThisAtom
.
x
;
vectorZ
[
1
]
=
coordinatesAxisAtom
.
y
-
coordinatesThisAtom
.
y
;
vectorZ
[
2
]
=
coordinatesAxisAtom
.
z
-
coordinatesThisAtom
.
z
;
multipoleAtomIndex
=
multiPoleAtoms
[
atomIndex
].
x
;
coordinatesAxisAtom
=
atomCoord
[
multipoleAtomIndex
];
vectorX
[
0
]
=
coordinatesAxisAtom
.
x
-
coordinatesThisAtom
.
x
;
vectorX
[
1
]
=
coordinatesAxisAtom
.
y
-
coordinatesThisAtom
.
y
;
vectorX
[
2
]
=
coordinatesAxisAtom
.
z
-
coordinatesThisAtom
.
z
;
int
axisType
=
multiPoleAtoms
[
atomIndex
].
w
;
while
(
particleIndex
<
numberOfParticles
)
{
float4
coordinatesThisParticle
=
particleCoord
[
particleIndex
];
int
multipoleParticleIndex
=
multiPoleParticles
[
particleIndex
].
z
;
float4
coordinatesAxisParticle
=
particleCoord
[
multipoleParticleIndex
];
vectorZ
[
0
]
=
coordinatesAxisParticle
.
x
-
coordinatesThisParticle
.
x
;
vectorZ
[
1
]
=
coordinatesAxisParticle
.
y
-
coordinatesThisParticle
.
y
;
vectorZ
[
2
]
=
coordinatesAxisParticle
.
z
-
coordinatesThisParticle
.
z
;
multipoleParticleIndex
=
multiPoleParticles
[
particleIndex
].
x
;
coordinatesAxisParticle
=
particleCoord
[
multipoleParticleIndex
];
vectorX
[
0
]
=
coordinatesAxisParticle
.
x
-
coordinatesThisParticle
.
x
;
vectorX
[
1
]
=
coordinatesAxisParticle
.
y
-
coordinatesThisParticle
.
y
;
vectorX
[
2
]
=
coordinatesAxisParticle
.
z
-
coordinatesThisParticle
.
z
;
int
axisType
=
multiPoleParticles
[
particleIndex
].
w
;
/*
z-only
(1) norm z
(2) select random x
(3) x = x - (x.z)z
(4) norm x
z-then-x
(1) norm z
(2) norm x (not needed)
(3) x = x - (x.z)z
(4) norm x
bisector
(1) norm z
(2) norm x
(3) z = x + z
(4) norm z
(5) x = x - (x.z)z
(6) norm x
z-bisect
(1) norm z
(2) norm x
(3) norm y
(3) x = x + y
(4) norm x
(5) x = x - (x.z)z
(6) norm x
/*
z-only
(1) norm z
(2) select random x
(3) x = x - (x.z)z
(4) norm x
z-then-x
(1) norm z
(2) norm x (not needed)
(3) x = x - (x.z)z
(4) norm x
bisector
(1) norm z
(2) norm x
(3) z = x + z
(4) norm z
(5) x = x - (x.z)z
(6) norm x
z-bisect
(1) norm z
(2) norm x
(3) norm y
(3) x = x + y
(4) norm x
(5) x = x - (x.z)z
(6) norm x
3-fold
(1) norm z
(2) norm x
(3) norm y
(4) z = x + y + z
(5) norm z
(6) x = x - (x.z)z
(7) norm x
*/
// branch based on axis type
3-fold
(1) norm z
(2) norm x
(3) norm y
(4) z = x + y + z
(5) norm z
(6) x = x - (x.z)z
(7) norm x
*/
// branch based on axis type
float
sum
=
normVector3
(
vectorZ
);
if
(
axisType
==
1
){
// bisector
sum
=
normVector3
(
vectorX
);
vectorZ
[
0
]
+=
vectorX
[
0
];
vectorZ
[
1
]
+=
vectorX
[
1
];
vectorZ
[
2
]
+=
vectorX
[
2
];
sum
=
normVector3
(
vectorZ
);
}
else
if
(
axisType
==
2
||
axisType
==
3
){
float
sum
=
normVector3
(
vectorZ
);
if
(
axisType
==
1
){
// bisector
// z-bisect
multipoleParticleIndex
=
multiPoleParticles
[
particleIndex
].
y
;
coordinatesAxisParticle
=
particleCoord
[
multipoleParticleIndex
];
vectorY
[
0
]
=
coordinatesAxisParticle
.
x
-
coordinatesThisParticle
.
x
;
vectorY
[
1
]
=
coordinatesAxisParticle
.
y
-
coordinatesThisParticle
.
y
;
vectorY
[
2
]
=
coordinatesAxisParticle
.
z
-
coordinatesThisParticle
.
z
;
sum
=
normVector3
(
vectorY
);
sum
=
normVector3
(
vectorX
);
if
(
axisType
==
2
){
vectorX
[
0
]
+=
vectorY
[
0
];
vectorX
[
1
]
+=
vectorY
[
1
];
vectorX
[
2
]
+=
vectorY
[
2
];
sum
=
normVector3
(
vectorX
);
}
else
{
// 3-fold
sum
=
normVector3
(
vectorX
);
vectorZ
[
0
]
+=
vectorX
[
0
]
+
vectorY
[
0
];
vectorZ
[
1
]
+=
vectorX
[
1
]
+
vectorY
[
1
];
vectorZ
[
2
]
+=
vectorX
[
2
]
+
vectorY
[
2
];
sum
=
normVector3
(
vectorZ
);
}
}
else
if
(
axisType
>=
4
){
vectorX
[
0
]
=
0.1
f
;
vectorX
[
1
]
=
0.1
f
;
vectorX
[
2
]
=
0.1
f
;
}
vectorZ
[
0
]
+=
vectorX
[
0
];
vectorZ
[
1
]
+=
vectorX
[
1
];
vectorZ
[
2
]
+=
vectorX
[
2
];
sum
=
normVector3
(
vectorZ
);
}
else
if
(
axisType
==
2
||
axisType
==
3
){
// z-bisect
multipoleAtomIndex
=
multiPoleAtoms
[
atomIndex
].
y
;
coordinatesAxisAtom
=
atomCoord
[
multipoleAtomIndex
];
vectorY
[
0
]
=
coordinatesAxisAtom
.
x
-
coordinatesThisAtom
.
x
;
vectorY
[
1
]
=
coordinatesAxisAtom
.
y
-
coordinatesThisAtom
.
y
;
vectorY
[
2
]
=
coordinatesAxisAtom
.
z
-
coordinatesThisAtom
.
z
;
sum
=
normVector3
(
vectorY
);
sum
=
normVector3
(
vectorX
);
if
(
axisType
==
2
){
vectorX
[
0
]
+=
vectorY
[
0
];
vectorX
[
1
]
+=
vectorY
[
1
];
vectorX
[
2
]
+=
vectorY
[
2
];
sum
=
normVector3
(
vectorX
);
}
else
{
// 3-fold
// x = x - (x.z)z
float
dot
=
vectorZ
[
0
]
*
vectorX
[
0
]
+
vectorZ
[
1
]
*
vectorX
[
1
]
+
vectorZ
[
2
]
*
vectorX
[
2
];
vectorX
[
0
]
-=
dot
*
vectorZ
[
0
];
vectorX
[
1
]
-=
dot
*
vectorZ
[
1
];
vectorX
[
2
]
-=
dot
*
vectorZ
[
2
];
sum
=
normVector3
(
vectorX
);
vectorZ
[
0
]
+=
vectorX
[
0
]
+
vectorY
[
0
];
vectorZ
[
1
]
+=
vectorX
[
1
]
+
vectorY
[
1
];
vectorZ
[
2
]
+=
vectorX
[
2
]
+
vectorY
[
2
];
sum
=
normVector3
(
vectorZ
);
vectorY
[
0
]
=
(
vectorZ
[
1
]
*
vectorX
[
2
])
-
(
vectorZ
[
2
]
*
vectorX
[
1
]);
vectorY
[
1
]
=
(
vectorZ
[
2
]
*
vectorX
[
0
])
-
(
vectorZ
[
0
]
*
vectorX
[
2
]);
vectorY
[
2
]
=
(
vectorZ
[
0
]
*
vectorX
[
1
])
-
(
vectorZ
[
1
]
*
vectorX
[
0
]);
// use identity rotation matrix for unrecognized axis types
if
(
axisType
<
0
||
axisType
>
4
){
vectorX
[
0
]
=
1.0
f
;
vectorX
[
1
]
=
0.0
f
;
vectorX
[
2
]
=
0.0
f
;
vectorY
[
0
]
=
0.0
f
;
vectorY
[
1
]
=
1.0
f
;
vectorY
[
2
]
=
0.0
f
;
vectorZ
[
0
]
=
0.0
f
;
vectorZ
[
1
]
=
0.0
f
;
vectorZ
[
2
]
=
1.0
f
;
}
}
else
if
(
axisType
>=
4
){
vectorX
[
0
]
=
0.1
f
;
vectorX
[
1
]
=
0.1
f
;
vectorX
[
2
]
=
0.1
f
;
}
// x = x - (x.z)z
float
dot
=
vectorZ
[
0
]
*
vectorX
[
0
]
+
vectorZ
[
1
]
*
vectorX
[
1
]
+
vectorZ
[
2
]
*
vectorX
[
2
];
float
molDipole
[
3
];
molDipole
[
0
]
=
labFrameDipole
[
particleIndex
*
3
];
molDipole
[
1
]
=
labFrameDipole
[
particleIndex
*
3
+
1
];
molDipole
[
2
]
=
labFrameDipole
[
particleIndex
*
3
+
2
];
vectorX
[
0
]
-=
dot
*
vectorZ
[
0
];
vectorX
[
1
]
-=
dot
*
vectorZ
[
1
];
vectorX
[
2
]
-=
dot
*
vectorZ
[
2
];
// set out-of-range elements to 0.0f
sum
=
normVector3
(
vectorX
);
labFrameDipole
[
particleIndex
*
3
]
=
molDipole
[
0
]
*
vectorX
[
0
]
+
molDipole
[
1
]
*
vectorY
[
0
]
+
molDipole
[
2
]
*
vectorZ
[
0
];
labFrameDipole
[
particleIndex
*
3
+
1
]
=
molDipole
[
0
]
*
vectorX
[
1
]
+
molDipole
[
1
]
*
vectorY
[
1
]
+
molDipole
[
2
]
*
vectorZ
[
1
];
labFrameDipole
[
particleIndex
*
3
+
2
]
=
molDipole
[
0
]
*
vectorX
[
2
]
+
molDipole
[
1
]
*
vectorY
[
2
]
+
molDipole
[
2
]
*
vectorZ
[
2
];
// ---------------------------------------------------------------------------------------
float
mPole
[
3
][
3
];
unsigned
int
offset
=
particleIndex
*
9
;
mPole
[
0
][
0
]
=
labFrameQuadrupole
[
offset
];
mPole
[
0
][
1
]
=
labFrameQuadrupole
[
offset
+
1
];
mPole
[
0
][
2
]
=
labFrameQuadrupole
[
offset
+
2
];
mPole
[
1
][
0
]
=
labFrameQuadrupole
[
offset
+
3
];
mPole
[
1
][
1
]
=
labFrameQuadrupole
[
offset
+
4
];
mPole
[
1
][
2
]
=
labFrameQuadrupole
[
offset
+
5
];
mPole
[
2
][
0
]
=
labFrameQuadrupole
[
offset
+
6
];
mPole
[
2
][
1
]
=
labFrameQuadrupole
[
offset
+
7
];
mPole
[
2
][
2
]
=
labFrameQuadrupole
[
offset
+
8
];
labFrameQuadrupole
[
offset
+
8
]
=
vectorX
[
2
]
*
(
vectorX
[
2
]
*
mPole
[
0
][
0
]
+
vectorY
[
2
]
*
mPole
[
0
][
1
]
+
vectorZ
[
2
]
*
mPole
[
0
][
2
]);
labFrameQuadrupole
[
offset
+
8
]
+=
vectorY
[
2
]
*
(
vectorX
[
2
]
*
mPole
[
1
][
0
]
+
vectorY
[
2
]
*
mPole
[
1
][
1
]
+
vectorZ
[
2
]
*
mPole
[
1
][
2
]);
labFrameQuadrupole
[
offset
+
8
]
+=
vectorZ
[
2
]
*
(
vectorX
[
2
]
*
mPole
[
2
][
0
]
+
vectorY
[
2
]
*
mPole
[
2
][
1
]
+
vectorZ
[
2
]
*
mPole
[
2
][
2
]);
vectorY
[
0
]
=
(
vectorZ
[
1
]
*
vectorX
[
2
])
-
(
vectorZ
[
2
]
*
vectorX
[
1
]);
vectorY
[
1
]
=
(
vectorZ
[
2
]
*
vectorX
[
0
])
-
(
vectorZ
[
0
]
*
vectorX
[
2
]);
vectorY
[
2
]
=
(
vectorZ
[
0
]
*
vectorX
[
1
])
-
(
vectorZ
[
1
]
*
vectorX
[
0
]);
// use identity rotation matrix for unrecognized axis types
labFrameQuadrupole
[
offset
+
4
]
=
vectorX
[
1
]
*
(
vectorX
[
1
]
*
mPole
[
0
][
0
]
+
vectorY
[
1
]
*
mPole
[
0
][
1
]
+
vectorZ
[
1
]
*
mPole
[
0
][
2
]);
labFrameQuadrupole
[
offset
+
4
]
+=
vectorY
[
1
]
*
(
vectorX
[
1
]
*
mPole
[
1
][
0
]
+
vectorY
[
1
]
*
mPole
[
1
][
1
]
+
vectorZ
[
1
]
*
mPole
[
1
][
2
]);
labFrameQuadrupole
[
offset
+
4
]
+=
vectorZ
[
1
]
*
(
vectorX
[
1
]
*
mPole
[
2
][
0
]
+
vectorY
[
1
]
*
mPole
[
2
][
1
]
+
vectorZ
[
1
]
*
mPole
[
2
][
2
]);
if
(
axisType
<
0
||
axisType
>
4
){
labFrameQuadrupole
[
offset
+
5
]
=
vectorX
[
1
]
*
(
vectorX
[
2
]
*
mPole
[
0
][
0
]
+
vectorY
[
2
]
*
mPole
[
0
][
1
]
+
vectorZ
[
2
]
*
mPole
[
0
][
2
]);
labFrameQuadrupole
[
offset
+
5
]
+=
vectorY
[
1
]
*
(
vectorX
[
2
]
*
mPole
[
1
][
0
]
+
vectorY
[
2
]
*
mPole
[
1
][
1
]
+
vectorZ
[
2
]
*
mPole
[
1
][
2
]);
labFrameQuadrupole
[
offset
+
5
]
+=
vectorZ
[
1
]
*
(
vectorX
[
2
]
*
mPole
[
2
][
0
]
+
vectorY
[
2
]
*
mPole
[
2
][
1
]
+
vectorZ
[
2
]
*
mPole
[
2
][
2
]);
vectorX
[
0
]
=
1.0
f
;
vectorX
[
1
]
=
0.0
f
;
vectorX
[
2
]
=
0.0
f
;
labFrameQuadrupole
[
offset
]
=
vectorX
[
0
]
*
(
vectorX
[
0
]
*
mPole
[
0
][
0
]
+
vectorY
[
0
]
*
mPole
[
0
][
1
]
+
vectorZ
[
0
]
*
mPole
[
0
][
2
])
;
labFrameQuadrupole
[
offset
]
+=
vectorY
[
0
]
*
(
vectorX
[
0
]
*
mPole
[
1
][
0
]
+
vectorY
[
0
]
*
mPole
[
1
][
1
]
+
vectorZ
[
0
]
*
mPole
[
1
][
2
])
;
labFrameQuadrupole
[
offset
]
+=
vectorZ
[
0
]
*
(
vectorX
[
0
]
*
mPole
[
2
][
0
]
+
vectorY
[
0
]
*
mPole
[
2
][
1
]
+
vectorZ
[
0
]
*
mPole
[
2
][
2
])
;
vectorY
[
0
]
=
0.0
f
;
vectorY
[
1
]
=
1.0
f
;
vectorY
[
2
]
=
0.0
f
;
labFrameQuadrupole
[
offset
+
1
]
=
vectorX
[
0
]
*
(
vectorX
[
1
]
*
mPole
[
0
][
0
]
+
vectorY
[
1
]
*
mPole
[
0
][
1
]
+
vectorZ
[
1
]
*
mPole
[
0
][
2
])
;
labFrameQuadrupole
[
offset
+
1
]
+=
vectorY
[
0
]
*
(
vectorX
[
1
]
*
mPole
[
1
][
0
]
+
vectorY
[
1
]
*
mPole
[
1
][
1
]
+
vectorZ
[
1
]
*
mPole
[
1
][
2
])
;
labFrameQuadrupole
[
offset
+
1
]
+=
vectorZ
[
0
]
*
(
vectorX
[
1
]
*
mPole
[
2
][
0
]
+
vectorY
[
1
]
*
mPole
[
2
][
1
]
+
vectorZ
[
1
]
*
mPole
[
2
][
2
])
;
vectorZ
[
0
]
=
0.0
f
;
vectorZ
[
1
]
=
0.0
f
;
vectorZ
[
2
]
=
1.0
f
;
}
float
molDipole
[
3
];
float
*
labDipole
=
&
(
labFrameDipole
[
atomIndex
*
3
]);
molDipole
[
0
]
=
labDipole
[
0
];
molDipole
[
1
]
=
labDipole
[
1
];
molDipole
[
2
]
=
labDipole
[
2
];
// set out-of-range elements to 0.0f
labFrameQuadrupole
[
offset
+
2
]
=
vectorX
[
0
]
*
(
vectorX
[
2
]
*
mPole
[
0
][
0
]
+
vectorY
[
2
]
*
mPole
[
0
][
1
]
+
vectorZ
[
2
]
*
mPole
[
0
][
2
]);
labFrameQuadrupole
[
offset
+
2
]
+=
vectorY
[
0
]
*
(
vectorX
[
2
]
*
mPole
[
1
][
0
]
+
vectorY
[
2
]
*
mPole
[
1
][
1
]
+
vectorZ
[
2
]
*
mPole
[
1
][
2
]);
labFrameQuadrupole
[
offset
+
2
]
+=
vectorZ
[
0
]
*
(
vectorX
[
2
]
*
mPole
[
2
][
0
]
+
vectorY
[
2
]
*
mPole
[
2
][
1
]
+
vectorZ
[
2
]
*
mPole
[
2
][
2
]);
labDipole
[
0
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
molDipole
[
0
]
*
vectorX
[
0
]
+
molDipole
[
1
]
*
vectorY
[
0
]
+
molDipole
[
2
]
*
vectorZ
[
0
];
labDipole
[
1
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
molDipole
[
0
]
*
vectorX
[
1
]
+
molDipole
[
1
]
*
vectorY
[
1
]
+
molDipole
[
2
]
*
vectorZ
[
1
];
labDipole
[
2
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
molDipole
[
0
]
*
vectorX
[
2
]
+
molDipole
[
1
]
*
vectorY
[
2
]
+
molDipole
[
2
]
*
vectorZ
[
2
];
// ---------------------------------------------------------------------------------------
float
*
rPole
[
3
];
float
mPole
[
3
][
3
];
float
*
labQuadrupole
=
&
(
labFrameQuadrupole
[
atomIndex
*
9
]);
for
(
int
ii
=
0
;
ii
<
3
;
ii
++
){
mPole
[
ii
][
0
]
=
labQuadrupole
[
3
*
ii
+
0
];
mPole
[
ii
][
1
]
=
labQuadrupole
[
3
*
ii
+
1
];
mPole
[
ii
][
2
]
=
labQuadrupole
[
3
*
ii
+
2
];
labFrameQuadrupole
[
offset
+
3
]
=
labFrameQuadrupole
[
offset
+
1
];
labFrameQuadrupole
[
offset
+
6
]
=
labFrameQuadrupole
[
offset
+
2
];
labFrameQuadrupole
[
offset
+
7
]
=
labFrameQuadrupole
[
offset
+
5
];
rPole
[
ii
]
=
labQuadrupole
+
ii
*
3
;
rPole
[
ii
][
0
]
=
0.0
f
;
rPole
[
ii
][
1
]
=
0.0
f
;
rPole
[
ii
][
2
]
=
0.0
f
;
}
int
ii
=
threadIdx
.
x
;
if
(
ii
<
3
){
for
(
int
jj
=
ii
;
jj
<
3
;
jj
++
){
rPole
[
ii
][
jj
]
+=
vectorX
[
ii
]
*
vectorX
[
jj
]
*
mPole
[
0
][
0
];
rPole
[
ii
][
jj
]
+=
vectorX
[
ii
]
*
vectorY
[
jj
]
*
mPole
[
0
][
1
];
rPole
[
ii
][
jj
]
+=
vectorX
[
ii
]
*
vectorZ
[
jj
]
*
mPole
[
0
][
2
];
rPole
[
ii
][
jj
]
+=
vectorY
[
ii
]
*
vectorX
[
jj
]
*
mPole
[
1
][
0
];
rPole
[
ii
][
jj
]
+=
vectorY
[
ii
]
*
vectorY
[
jj
]
*
mPole
[
1
][
1
];
rPole
[
ii
][
jj
]
+=
vectorY
[
ii
]
*
vectorZ
[
jj
]
*
mPole
[
1
][
2
];
rPole
[
ii
][
jj
]
+=
vectorZ
[
ii
]
*
vectorX
[
jj
]
*
mPole
[
2
][
0
];
rPole
[
ii
][
jj
]
+=
vectorZ
[
ii
]
*
vectorY
[
jj
]
*
mPole
[
2
][
1
];
rPole
[
ii
][
jj
]
+=
vectorZ
[
ii
]
*
vectorZ
[
jj
]
*
mPole
[
2
][
2
];
}
particleIndex
+=
gridDim
.
x
*
blockDim
.
x
;
}
__syncthreads
();
rPole
[
1
][
0
]
=
rPole
[
0
][
1
];
rPole
[
2
][
0
]
=
rPole
[
0
][
2
];
rPole
[
2
][
1
]
=
rPole
[
1
][
2
];
// set out-of-range elements to 0.0f
labQuadrupole
[
0
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
0
];
labQuadrupole
[
1
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
1
];
labQuadrupole
[
2
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
2
];
labQuadrupole
[
3
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
3
];
labQuadrupole
[
4
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
4
];
labQuadrupole
[
5
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
5
];
labQuadrupole
[
6
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
6
];
labQuadrupole
[
7
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
7
];
labQuadrupole
[
8
]
=
atomIndex
>=
numOfAtoms
?
0.0
f
:
labQuadrupole
[
8
];
}
void
cudaComputeAmoebaLabFrameMoments
(
amoebaGpuContext
amoebaGpu
)
...
...
@@ -405,8 +379,8 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
int
numBlocks
=
amoebaGpu
->
paddedNumberOfAtom
s
;
int
numThreads
=
20
;
int
numBlocks
=
gpu
->
sim
.
block
s
;
int
numThreads
=
gpu
->
sim
.
update_threads_per_block
;
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
...
@@ -434,100 +408,19 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
double
kernelTime
=
0.0
;
#endif
// copy molecular moments to lab frame moment arrays
// check if chiral center requires moments to have sign flipped
// compute lab frame moments
cudaMemcpy
(
amoebaGpu
->
psLabFrameDipole
->
_pDevData
,
amoebaGpu
->
psMolecularDipole
->
_pDevData
,
3
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
cudaMemcpy
(
amoebaGpu
->
psLabFrameQuadrupole
->
_pDevData
,
amoebaGpu
->
psMolecularQuadrupole
->
_pDevData
,
9
*
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
);
kCudaComputeCheckChiral_kernel
<<<
numBlocks
,
numThreads
>>>
(
);
LAUNCHERROR
(
"kCudaComputeCheckChiral"
);
kCudaComputeLabFrameMoments_kernel
<<<
numBlocks
,
numThreads
>>>
(
);
LAUNCHERROR
(
methodName
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
static
int
timestep
=
0
;
timestep
++
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished rotation kernel execution in %lf us
\n
"
,
kernelTime
);
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psLabFrameDipole
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"psLabFrameDipole completed
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psLabFrameQuadrupole
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"psLabFrameQpole completed
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
int
maxPrint
=
20
;
for
(
int
ii
=
0
;
ii
<
amoebaGpu
->
paddedNumberOfAtoms
;
ii
++
){
int
dipoleOffset
=
3
*
ii
;
int
quadrupoleOffset
=
9
*
ii
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
%6d [%6d %6d %6d] "
,
ii
,
amoebaGpu
->
psMultipoleParticlesIdsAndAxisType
->
_pSysData
[
ii
].
x
,
amoebaGpu
->
psMultipoleParticlesIdsAndAxisType
->
_pSysData
[
ii
].
y
,
amoebaGpu
->
psMultipoleParticlesIdsAndAxisType
->
_pSysData
[
ii
].
w
);
// coords
(
void
)
fprintf
(
amoebaGpu
->
log
,
"x[%16.9e %16.9e %16.9e]
\n
"
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
x
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
y
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
z
);
/*
(void) fprintf( amoebaGpu->log," R[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+1],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+2],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+3],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+4],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+5],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+6],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+7],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+8] );
*/
// dipole
(
void
)
fprintf
(
amoebaGpu
->
log
,
" D[%16.9e %16.9e %16.9e]
\n
"
,
amoebaGpu
->
psLabFrameDipole
->
_pSysData
[
dipoleOffset
],
amoebaGpu
->
psLabFrameDipole
->
_pSysData
[
dipoleOffset
+
1
],
amoebaGpu
->
psLabFrameDipole
->
_pSysData
[
dipoleOffset
+
2
]
);
// quadrupole
(
void
)
fprintf
(
amoebaGpu
->
log
,
" Q[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]
\n
"
,
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
1
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
2
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
3
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
4
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
5
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
6
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
7
],
amoebaGpu
->
psLabFrameQuadrupole
->
_pSysData
[
quadrupoleOffset
+
8
]
);
if
(
ii
==
maxPrint
&&
(
ii
<
(
gpu
->
natoms
-
maxPrint
))
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
int
nansDetected
=
checkForNansAndInfinities
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
psLabFrameDipole
);
nansDetected
+=
checkForNansAndInfinities
(
amoebaGpu
->
paddedNumberOfAtoms
*
9
,
amoebaGpu
->
psLabFrameQuadrupole
);
if
(
nansDetected
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Nans detected in dipole/quadrupoles.
\n
"
);
exit
(
0
);
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
if
(
0
){
int
particles
=
amoebaGpu
->
paddedNumberOfAtoms
;
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloat4Array
(
particles
,
3
,
gpu
->
psPosq4
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
particles
,
3
,
amoebaGpu
->
psLabFrameDipole
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
particles
,
9
,
amoebaGpu
->
psLabFrameQuadrupole
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaRotatedMoments"
,
fileId
,
outputVector
);
}
}
void
kCalculateAmoebaMultipoleForces
(
amoebaGpuContext
amoebaGpu
,
bool
hasAmoebaGeneralizedKirkwood
)
...
...
@@ -536,21 +429,9 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// compute lab frame moments
if
(
0
){
static
int
iteration
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
checkForNansFloat4
(
gpu
->
natoms
,
gpu
->
psPosq4
,
gpu
->
psAtomIndex
->
_pSysData
,
++
iteration
,
"MultipoleForcesPreLabCoord"
,
stderr
);
}
if
(
0
){
static
int
iteration
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
checkForNansFloat4
(
gpu
->
natoms
,
gpu
->
psForce4
,
gpu
->
psAtomIndex
->
_pSysData
,
++
iteration
,
"MultipoleForcesPreForce"
,
stderr
);
}
cudaComputeAmoebaLabFrameMoments
(
amoebaGpu
);
if
(
0
){
if
(
1
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
...
...
@@ -567,10 +448,14 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEAndGkFields
(
amoebaGpu
);
cudaComputeAmoebaMutualInducedAndGkField
(
amoebaGpu
);
}
else
{
if
(
amoebaGpu
->
multipoleNonbondedMethod
==
AMOEBA_NO_CUTOFF
){
cudaComputeAmoebaFixedEField
(
amoebaGpu
);
cudaComputeAmoebaMutualInducedField
(
amoebaGpu
);
}
else
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
...
...
@@ -581,6 +466,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kFindInteractionsWithinBlocksPeriodic"
);
cudaComputeAmoebaPmeFixedEField
(
amoebaGpu
);
cudaComputeAmoebaPmeMutualInducedField
(
amoebaGpu
);
}
...
...
@@ -597,33 +483,10 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// calculate electrostatic forces
if
(
amoebaGpu
->
multipoleNonbondedMethod
==
AMOEBA_NO_CUTOFF
){
cudaComputeAmoebaElectrostatic
(
amoebaGpu
);
// map torques to forces
cudaComputeAmoebaMapTorquesAndAddTotalForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
,
amoebaGpu
->
psForce
,
amoebaGpu
->
gpuContext
->
psForce4
);
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
amoebaGpu
->
gpuContext
->
psForce4
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
/
4.184
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaMpole"
,
fileId
,
outputVector
);
}
}
else
{
cudaComputeAmoebaPmeElectrostatic
(
amoebaGpu
);
}
if
(
0
){
static
int
iteration
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
checkForNansFloat4
(
gpu
->
natoms
,
gpu
->
psForce4
,
gpu
->
psAtomIndex
->
_pSysData
,
++
iteration
,
"MultipoleForcesPstForce"
,
stderr
);
}
}
#undef AMOEBA_DEBUG
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
View file @
761d7e17
...
...
@@ -117,23 +117,27 @@ void kClearFields_kernel( unsigned int bufferLength, float* EField )
void
kClearFields_3
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
numberToClear
)
{
kClearFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
384
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
*
amoebaGpu
->
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kClearFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
384
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
*
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_3_1"
);
kClearFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
384
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
*
amoebaGpu
->
outputBuffers
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_3_2"
);
if
(
numberToClear
>
1
){
kClearFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
384
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
*
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_2
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_3_2"
);
}
else
{
return
;
}
if
(
numberToClear
>
2
){
kClearFields_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
384
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
*
amoebaGpu
->
outputBuffers
,
kClearFields_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
384
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
*
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_3
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_3_3"
);
}
else
{
return
;
}
if
(
numberToClear
>
3
){
kClearFields_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
384
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
*
amoebaGpu
->
outputBuffers
,
kClearFields_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
384
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
*
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_4
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_3_4"
);
}
...
...
@@ -144,11 +148,12 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
void
kClearFields_1
(
amoebaGpuContext
amoebaGpu
)
{
kClearFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
384
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
amoebaGpu
->
outputBuffers
,
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kClearFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
384
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_1_1
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_1_1"
);
kClearFields_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
384
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
amoebaGpu
->
outputBuffers
,
kClearFields_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
384
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_1_2
->
_pDevData
);
LAUNCHERROR
(
"kClearFields_1_2"
);
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
View file @
761d7e17
...
...
@@ -449,8 +449,9 @@ static void kCalculateAmoebaVdw14_7NonReduction(amoebaGpuContext amoebaGpu, CUDA
static
void
kReduceVdw14_7
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
outputArray
)
{
kReduceFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
outputArray
->
_pDevData
);
LAUNCHERROR
(
"kReduceVdw14_7"
);
}
...
...
@@ -545,13 +546,13 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
if
(
1
&&
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Apply cutoff=%d warp=%d
\n
"
,
applyCutoff
,
gpu
->
bOutputBufferPerWarp
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
Vdw14_7Particle
),
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
if
(
0
){
gpu
->
psInteractionCount
->
Download
();
amoebaGpu
->
psVdwWorkUnit
->
Download
();
unsigned
int
totalWarps
=
(
amoebaGpu
->
nonbond
B
locks
*
threadsPerBlock
)
/
GRID
;
unsigned
int
totalWarps
=
(
gpu
->
sim
.
nonbond
_b
locks
*
threadsPerBlock
)
/
GRID
;
float
ratiof
=
(
float
)
totalWarps
/
(
float
)
amoebaGpu
->
psVdwWorkUnit
->
_length
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ixn warps=%u count=%u
\n
"
,
totalWarps
,
gpu
->
psInteractionCount
->
_pSysData
[
0
]
);
for
(
unsigned
int
ii
=
0
;
ii
<
amoebaGpu
->
psVdwWorkUnit
->
_length
;
ii
++
){
...
...
@@ -610,7 +611,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaVdw14_7CutoffByWarp_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaVdw14_7CutoffByWarp_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psAmoebaVdwCoordinates
->
_pDevData
,
amoebaGpu
->
psVdwSigmaEpsilon
->
_pDevData
,
...
...
@@ -624,7 +625,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#endif
}
else
{
kCalculateAmoebaVdw14_7Cutoff_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaVdw14_7Cutoff_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psAmoebaVdwCoordinates
->
_pDevData
,
amoebaGpu
->
psVdwSigmaEpsilon
->
_pDevData
,
...
...
@@ -644,7 +645,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaVdw14_7N2ByWarp_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaVdw14_7N2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psVdwWorkUnit
->
_pDevData
,
amoebaGpu
->
psAmoebaVdwCoordinates
->
_pDevData
,
amoebaGpu
->
psVdwSigmaEpsilon
->
_pDevData
,
...
...
@@ -658,7 +659,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#endif
}
else
{
kCalculateAmoebaVdw14_7N2_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaVdw14_7N2_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
Vdw14_7Particle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psVdwWorkUnit
->
_pDevData
,
amoebaGpu
->
psAmoebaVdwCoordinates
->
_pDevData
,
amoebaGpu
->
psVdwSigmaEpsilon
->
_pDevData
,
...
...
@@ -707,7 +708,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psWorkArray_3_1->Download();
//for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
for( int kk = 0; kk <
amoebaGpu->
outputBuffers; kk++ ){
for( int kk = 0; kk <
gpu->sim.
outputBuffers; kk++ ){
float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] ) )
(void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
View file @
761d7e17
...
...
@@ -95,7 +95,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
if
(
bExclusionFlag
){
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
unsigned
int
cell
=
xi
+
xi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
exclusionIndex
=
cAmoebaSim
.
pVdwExclusionIndicesIndex
[
cell
]
+
tgx
;
exclusionMask
=
cAmoebaSim
.
pVdwExclusionIndices
[
exclusionIndex
];
}
...
...
@@ -136,7 +136,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
);
// mask out excluded ixns
unsigned
int
mask
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
j
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
j
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
if
(
mask
&&
bExclusionFlag
){
unsigned
int
maskIndex
=
1
<<
j
;
mask
=
(
exclusionMask
&
maskIndex
)
?
0
:
1
;
...
...
@@ -158,25 +158,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray
[
index
].
z
=
-
1
.
0
f
;
debugArray
[
index
].
w
=
(
float
)
(
mask
+
1
);
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
tgx
;
debugArray
[
index
].
w
=
energy
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
1
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
1
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
1
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
1
].
w
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
ijForce
[
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
ijForce
[
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
ijForce
[
2
]
:
0
.
0
f
;
...
...
@@ -189,11 +189,11 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
forceSum
,
outputForce
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
forceSum
,
outputForce
);
#endif
...
...
@@ -268,7 +268,7 @@ flags = 0xFFFFFFFF;
// mask out excluded ixns
unsigned
int
mask
=
(
(
atomI
>=
c
AmoebaSim
.
numberOfA
toms
)
||
((
y
+
jIdx
)
>=
c
AmoebaSim
.
numberOfA
toms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
c
Sim
.
a
toms
)
||
((
y
+
jIdx
)
>=
c
Sim
.
a
toms
)
)
?
0
:
1
;
if
(
mask
&&
bExclusionFlag
){
unsigned
int
maskIndex
=
1
<<
jIdx
;
mask
=
(
exclusionMask
&
maskIndex
)
?
0
:
1
;
...
...
@@ -337,25 +337,25 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray
[
index
].
z
=
-
3
.
0
;
debugArray
[
index
].
w
=
(
float
)
(
mask
+
1
);
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
y
;
debugArray
[
index
].
z
=
(
float
)
tgx
;
debugArray
[
index
].
w
=
energy
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
1
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
1
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
1
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
1
].
w
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
ijForce
[
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
ijForce
[
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
ijForce
[
2
]
:
0
.
0
f
;
...
...
@@ -375,17 +375,17 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
forceSum
,
outputForce
);
offset
=
3
*
(
y
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
force
,
outputForce
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
forceSum
,
outputForce
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
force
,
outputForce
);
#endif
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
View file @
761d7e17
...
...
@@ -357,8 +357,9 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
static
void
kReduceWcaDispersion
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
outputArray
)
{
kReduceFields_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFields_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
outputArray
->
_pDevData
);
LAUNCHERROR
(
"kReduceWcaDispersion"
);
}
...
...
@@ -367,8 +368,9 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
static
void
kReduceWcaDispersionToFloat4
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float4
>*
outputArray
)
{
kReduceFieldsToFloat4_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
fieldReduceThreadsPerBlock
>>>
(
amoebaGpu
->
paddedNumberOfAtoms
*
3
,
amoebaGpu
->
outputBuffers
,
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kReduceFieldsToFloat4_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
sim
.
paddedNumberOfAtoms
*
3
,
gpu
->
sim
.
outputBuffers
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevData
,
outputArray
->
_pDevData
);
LAUNCHERROR
(
"kReduceWcaDispersion"
);
}
...
...
@@ -417,7 +419,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u
\n
"
,
methodName
,
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
amoebaG
pu
->
bOutputBufferPerWarp
,
methodName
,
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
g
pu
->
bOutputBufferPerWarp
,
sizeof
(
WcaDispersionParticle
),
sizeof
(
WcaDispersionParticle
)
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
...
...
@@ -425,7 +427,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaWcaDispersionN2ByWarp_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
WcaDispersionParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaWcaDispersionN2ByWarp_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
WcaDispersionParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevData
,
gpu
->
psPosq4
->
_pDevData
,
amoebaGpu
->
psWcaDispersionRadiusEpsilon
->
_pDevData
,
...
...
@@ -438,7 +440,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
}
else
{
kCalculateAmoebaWcaDispersionN2_kernel
<<<
amoebaGpu
->
nonbond
B
locks
,
threadsPerBlock
,
sizeof
(
WcaDispersionParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaWcaDispersionN2_kernel
<<<
gpu
->
sim
.
nonbond
_b
locks
,
threadsPerBlock
,
sizeof
(
WcaDispersionParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevData
,
gpu
->
psPosq4
->
_pDevData
,
amoebaGpu
->
psWcaDispersionRadiusEpsilon
->
_pDevData
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
View file @
761d7e17
...
...
@@ -137,43 +137,43 @@ unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
//debugArray[index].z = (float) c
AmoebaSim.numberOfA
toms;
//debugArray[index].z = (float) c
Sim.a
toms;
debugArray
[
index
].
z
=
atomI
==
(
y
+
tj
)
?
0
.
0
f
:
energy
;
energy
=
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
)
&&
((
y
+
tj
)
<
c
AmoebaSim
.
numberOfA
toms
)
)
?
(
energy
)
:
0
.
0
f
;
energy
=
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
Sim
.
a
toms
)
&&
((
y
+
tj
)
<
c
Sim
.
a
toms
)
)
?
(
energy
)
:
0
.
0
f
;
debugArray
[
index
].
w
=
energy
+
totalEnergy
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
iCoord
.
x
;
debugArray
[
index
].
y
=
iCoord
.
y
;
debugArray
[
index
].
z
=
iCoord
.
z
;
debugArray
[
index
].
w
=
(
float
)
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
jCoord
.
x
;
debugArray
[
index
].
y
=
jCoord
.
y
;
debugArray
[
index
].
z
=
jCoord
.
z
;
debugArray
[
index
].
w
=
-
4
.
0
f
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
emixo
;
debugArray
[
index
].
y
=
emixh
;
debugArray
[
index
].
z
=
rmixo
;
debugArray
[
index
].
w
=
rmixh
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
#if 0
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[2].x;
debugArray[index].y = pullDebug[2].y;
debugArray[index].z = pullDebug[2].z;
...
...
@@ -184,7 +184,7 @@ debugArray[index].w = pullDebug[2].w;
// energy = 0.0f;
}
#endif
if
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
)
&&
((
y
+
tj
)
<
c
AmoebaSim
.
numberOfA
toms
)
){
if
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
Sim
.
a
toms
)
&&
((
y
+
tj
)
<
c
Sim
.
a
toms
)
){
// add to field at atomI the field due atomJ's dipole
...
...
@@ -219,34 +219,34 @@ debugArray[index].w = pullDebug[2].w;
if
(
(
atomI
==
targetAtom
)
||
(
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
index
+=
2
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
2
*
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
atomI
==
(
y
+
tj
)
?
0
.
0
f
:
energy
;
energy
=
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
)
&&
((
y
+
tj
)
<
c
AmoebaSim
.
numberOfA
toms
)
)
?
(
energy
)
:
0
.
0
f
;
energy
=
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
Sim
.
a
toms
)
&&
((
y
+
tj
)
<
c
Sim
.
a
toms
)
)
?
(
energy
)
:
0
.
0
f
;
debugArray
[
index
].
w
=
energy
+
totalEnergy
;
//debugArray[index].w = -2.0f;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
emjxo
;
debugArray
[
index
].
y
=
emjxh
;
debugArray
[
index
].
z
=
rmjxo
;
debugArray
[
index
].
w
=
rmjxh
;
index
+=
c
Amoeba
Sim
.
paddedNumberOfAtoms
;
index
+=
cSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullDebug
[
0
].
x
;
debugArray
[
index
].
y
=
pullDebug
[
0
].
y
;
debugArray
[
index
].
z
=
pullDebug
[
0
].
z
;
debugArray
[
index
].
w
=
pullDebug
[
0
].
w
;
#if 0
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += c
Amoeba
Sim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[2].x;
debugArray[index].y = pullDebug[2].y;
debugArray[index].z = pullDebug[2].z;
...
...
@@ -257,7 +257,7 @@ debugArray[index].w = pullDebug[2].w;
//energy = 0.0f;
}
#endif
if
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
AmoebaSim
.
numberOfA
toms
)
&&
((
y
+
tj
)
<
c
AmoebaSim
.
numberOfA
toms
)
){
if
(
(
atomI
!=
(
y
+
tj
))
&&
(
atomI
<
c
Sim
.
a
toms
)
&&
((
y
+
tj
)
<
c
Sim
.
a
toms
)
){
// add to field at atomI the field due atomJ's dipole
...
...
@@ -281,18 +281,18 @@ debugArray[index].w = pullDebug[2].w;
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
forceSum
,
outputForce
);
offset
=
3
*
(
y
+
tgx
+
warp
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
warp
*
cSim
.
paddedNumberOfAtoms
);
load3dArrayBufferPerWarp
(
offset
,
sA
[
threadIdx
.
x
].
force
,
outputForce
);
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
forceSum
,
outputForce
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
c
Amoeba
Sim
.
paddedNumberOfAtoms
);
offset
=
3
*
(
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
paddedNumberOfAtoms
);
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
force
,
outputForce
);
#endif
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment