Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8a331fb9
"vscode:/vscode.git/clone" did not exist on "b36ac4e1f03e359b5e1df4f2c53b41e6f3e758a3"
Commit
8a331fb9
authored
Apr 20, 2011
by
Mark Friedrichs
Browse files
Direct space optimizations
parent
af4d503a
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
1046 additions
and
290 deletions
+1046
-290
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+6
-5
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
...ins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
...cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
...latforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
...c/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
...uda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
...a/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+0
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
...src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+890
-65
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
.../src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+6
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
...ms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+2
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
.../src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+101
-15
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
...a/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+31
-190
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
...platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+1
-1
No files found.
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
View file @
8a331fb9
...
...
@@ -3449,19 +3449,20 @@ tgx = 0;
Get threads/block
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
)
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
,
unsigned
int
sharedMemoryPerBlock
)
{
unsigned
int
grid
=
amoebaGpu
->
gpuContext
->
grid
;
unsigned
int
threadsPerBlock
=
(
amoebaGpu
->
gpuContext
->
sharedMemoryPerBlock
+
grid
-
1
)
/
(
grid
*
sharedMemoryPerThread
);
unsigned
int
threadsPerBlock
=
(
sharedMemoryPerBlock
+
grid
-
1
)
/
(
grid
*
sharedMemoryPerThread
);
threadsPerBlock
=
threadsPerBlock
<
1
?
1
:
threadsPerBlock
;
threadsPerBlock
*=
grid
;
threadsPerBlock
*=
grid
;
return
threadsPerBlock
;
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
View file @
8a331fb9
...
...
@@ -160,7 +160,7 @@ extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAS
extern
void
kClearFloat4
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
entries
,
CUDAStream
<
float4
>*
fieldToClear
);
extern
void
kClearFields_1
(
amoebaGpuContext
amoebaGpu
);
extern
void
kClearFields_3
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
numberToClear
);
extern
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
);
extern
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
,
unsigned
int
sharedMemoryPerBlock
);
//extern int isNanOrInfinity( double number );
extern
void
trackMutualInducedIterations
(
amoebaGpuContext
amoebaGpu
,
int
iteration
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
View file @
8a331fb9
...
...
@@ -759,7 +759,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
ElectrostaticParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
ElectrostaticParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
kClearFields_3
(
amoebaGpu
,
1
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
View file @
8a331fb9
...
...
@@ -362,7 +362,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
kClearFields_3
(
amoebaGpu
,
3
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
View file @
8a331fb9
...
...
@@ -108,7 +108,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
View file @
8a331fb9
...
...
@@ -1813,7 +1813,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
View file @
8a331fb9
...
...
@@ -978,7 +978,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
maxThreads
=
96
;
else
maxThreads
=
32
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodEDiffParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodEDiffParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
View file @
8a331fb9
...
...
@@ -490,7 +490,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
View file @
8a331fb9
...
...
@@ -276,7 +276,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
View file @
8a331fb9
...
...
@@ -4,7 +4,6 @@ struct MutualInducedParticle {
float
x
;
float
y
;
float
z
;
float
q
;
float
inducedDipole
[
3
];
float
inducedDipolePolar
[
3
];
...
...
@@ -41,7 +40,6 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
sA
->
x
=
posq
.
x
;
sA
->
y
=
posq
.
y
;
sA
->
z
=
posq
.
z
;
sA
->
q
=
posq
.
w
;
// dipole
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
View file @
8a331fb9
...
...
@@ -38,6 +38,8 @@ static int const UScaleIndex = 2;
static
int
const
MScaleIndex
=
3
;
static
int
const
LastScalingIndex
=
4
;
#define CALCULATE_FULL_TILE
struct
PmeDirectElectrostaticParticle
{
// coordinates charge
...
...
@@ -53,7 +55,7 @@ struct PmeDirectElectrostaticParticle {
// lab frame quadrupole
float
labFrameQuadrupole
[
9
];
float
labFrameQuadrupole
[
6
];
// induced dipole
...
...
@@ -69,14 +71,16 @@ struct PmeDirectElectrostaticParticle {
float
damp
;
float
force
[
3
];
float
torque
[
3
];
float
padding
;
//
float padding;
#ifndef CALCULATE_FULL_TILE
float
tempForce
[
3
];
float
tempTorque
[
3
];
#endif
};
#ifndef CALCULATE_FULL_TILE
__device__
void
sumTempBuffer
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
){
atomI
.
tempForce
[
0
]
+=
atomJ
.
tempForce
[
0
];
...
...
@@ -87,6 +91,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
atomI
.
tempTorque
[
1
]
+=
atomJ
.
tempTorque
[
1
];
atomI
.
tempTorque
[
2
]
+=
atomJ
.
tempTorque
[
2
];
}
#endif
/*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
...
...
@@ -124,11 +129,11 @@ __device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDir
atomI
.
labFrameDipole
[
2
]
*
atomI
.
labFrameDipole
[
2
];
float
qii
=
atomI
.
labFrameQuadrupole
[
0
]
*
atomI
.
labFrameQuadrupole
[
0
]
+
atomI
.
labFrameQuadrupole
[
4
]
*
atomI
.
labFrameQuadrupole
[
4
]
+
atomI
.
labFrameQuadrupole
[
8
]
*
atomI
.
labFrameQuadrupole
[
8
]
+
2.0
f
*
(
atomI
.
labFrameQuadrupole
[
3
]
*
atomI
.
labFrameQuadrupole
[
3
]
+
atomI
.
labFrameQuadrupole
[
5
]
*
atomI
.
labFrameQuadrupole
[
5
]
+
2.0
f
*
(
atomI
.
labFrameQuadrupole
[
1
]
*
atomI
.
labFrameQuadrupole
[
1
]
+
atomI
.
labFrameQuadrupole
[
2
]
*
atomI
.
labFrameQuadrupole
[
2
]
+
atomI
.
labFrameQuadrupole
[
5
]
*
atomI
.
labFrameQuadrupole
[
5
]);
atomI
.
labFrameQuadrupole
[
4
]
*
atomI
.
labFrameQuadrupole
[
4
]);
float
uii
=
atomI
.
labFrameDipole
[
0
]
*
atomI
.
inducedDipole
[
0
]
+
atomI
.
labFrameDipole
[
1
]
*
atomI
.
inducedDipole
[
1
]
+
atomI
.
labFrameDipole
[
2
]
*
atomI
.
inducedDipole
[
2
];
...
...
@@ -152,7 +157,836 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
atomI
.
torque
[
2
]
+=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
}
__device__
void
calculatePmeDirectElectrostaticPairIxn_kernel
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
,
/*
#undef SUB_METHOD_NAME
#undef F1
#define SUB_METHOD_NAME(a, b) a##F1##b
#define F1
//#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F1
//#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##F2##b
#define F2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T1
#define SUB_METHOD_NAME(a, b) a##T1##b
#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T2##b
#define T2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T3
#define SUB_METHOD_NAME(a, b) a##T3##b
#define T3
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T3
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T4##b
#define T4
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T4
#undef SUB_METHOD_NAME
*/
static
__device__
void
calculatePmeDirectElectrostaticPairIxnF1_kernel
(
const
PmeDirectElectrostaticParticle
&
atomI
,
const
PmeDirectElectrostaticParticle
&
atomJ
,
const
float4
delta
,
const
float4
bn
,
const
float
bn5
,
const
float
*
scalingFactors
,
float4
*
forceTorqueEnergy
){
float
xr
=
delta
.
x
;
float
yr
=
delta
.
y
;
float
zr
=
delta
.
z
;
float
r
=
delta
.
w
;
// set the permanent multipole and induced dipole values;
float
ci
=
atomI
.
q
;
float
di1
=
atomI
.
labFrameDipole
[
0
];
float
di2
=
atomI
.
labFrameDipole
[
1
];
float
di3
=
atomI
.
labFrameDipole
[
2
];
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
ck
=
atomJ
.
q
;
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
float
dk3
=
atomJ
.
labFrameDipole
[
2
];
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
5
];
float
bn1
=
bn
.
x
;
float
bn2
=
bn
.
y
;
float
bn3
=
bn
.
z
;
float
bn4
=
bn
.
w
;
// apply Thole polarization damping to scale factors
float
rr1
=
1.0
f
/
r
;
float
rr2
=
rr1
*
rr1
;
float
rr3
=
rr1
*
rr2
;
float
rr5
=
3.0
f
*
rr3
*
rr2
;
float
rr7
=
5.0
f
*
rr5
*
rr2
;
float
rr9
=
7.0
f
*
rr7
*
rr2
;
float
rr11
=
9.0
f
*
rr9
*
rr2
;
float
qir1
=
qi1
*
xr
+
qi2
*
yr
+
qi3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk2
*
yr
+
qk3
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk6
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
offset
=
1.0
f
-
scalingFactors
[
MScaleIndex
];
float
gf4
=
2.0
f
*
(
bn2
-
offset
*
rr5
);
float
qidk1
=
qi1
*
dk1
+
qi2
*
dk2
+
qi3
*
dk3
;
float
qkdi1
=
qk1
*
di1
+
qk2
*
di2
+
qk3
*
di3
;
float
ftm21
=
gf4
*
(
qkdi1
-
qidk1
);
float
qidk2
=
qi2
*
dk1
+
qi5
*
dk2
+
qi6
*
dk3
;
float
qkdi2
=
qk2
*
di1
+
qk5
*
di2
+
qk6
*
di3
;
float
ftm22
=
gf4
*
(
qkdi2
-
qidk2
);
float
qidk3
=
qi3
*
dk1
+
qi6
*
dk2
+
qi9
*
dk3
;
float
qkdi3
=
qk3
*
di1
+
qk6
*
di2
+
qk9
*
di3
;
float
ftm23
=
gf4
*
(
qkdi3
-
qidk3
);
float
gf7
=
4.0
f
*
(
bn3
-
offset
*
rr7
);
float
qiqkr1
=
qi1
*
qkr1
+
qi2
*
qkr2
+
qi3
*
qkr3
;
float
qkqir1
=
qk1
*
qir1
+
qk2
*
qir2
+
qk3
*
qir3
;
ftm21
+=
gf7
*
(
qiqkr1
+
qkqir1
);
float
qiqkr2
=
qi2
*
qkr1
+
qi5
*
qkr2
+
qi6
*
qkr3
;
float
qkqir2
=
qk2
*
qir1
+
qk5
*
qir2
+
qk6
*
qir3
;
ftm22
+=
gf7
*
(
qiqkr2
+
qkqir2
);
float
qiqkr3
=
qi3
*
qkr1
+
qi6
*
qkr2
+
qi9
*
qkr3
;
float
qkqir3
=
qk3
*
qir1
+
qk6
*
qir2
+
qk9
*
qir3
;
ftm23
+=
gf7
*
(
qiqkr3
+
qkqir3
);
// calculate the scalar products for permanent components
float
sc2
=
di1
*
dk1
+
di2
*
dk2
+
di3
*
dk3
;
float
sc7
=
qir1
*
dk1
+
qir2
*
dk2
+
qir3
*
dk3
;
float
sc8
=
qkr1
*
di1
+
qkr2
*
di2
+
qkr3
*
di3
;
float
sc9
=
qir1
*
qkr1
+
qir2
*
qkr2
+
qir3
*
qkr3
;
float
sc10
=
qi1
*
qk1
+
qi2
*
qk2
+
qi3
*
qk3
+
qi2
*
qk2
+
qi5
*
qk5
+
qi6
*
qk6
+
qi3
*
qk3
+
qi6
*
qk6
+
qi9
*
qk9
;
float
sc3
=
di1
*
xr
+
di2
*
yr
+
di3
*
zr
;
float
sc5
=
qir1
*
xr
+
qir2
*
yr
+
qir3
*
zr
;
float
sc4
=
dk1
*
xr
+
dk2
*
yr
+
dk3
*
zr
;
float
sc6
=
qkr1
*
xr
+
qkr2
*
yr
+
qkr3
*
zr
;
// calculate the scalar products for induced components
// calculate the gl functions for permanent components
float
gl0
=
ci
*
ck
;
float
gl1
=
ck
*
sc3
-
ci
*
sc4
;
float
gl2
=
ci
*
sc6
+
ck
*
sc5
-
sc3
*
sc4
;
float
gl3
=
sc3
*
sc6
-
sc4
*
sc5
;
float
gl4
=
sc5
*
sc6
;
float
gl5
=
-
4.0
f
*
sc9
;
float
gl6
=
sc2
;
float
gl7
=
2.0
f
*
(
sc7
-
sc8
);
float
gl8
=
2.0
f
*
sc10
;
forceTorqueEnergy
->
w
+=
-
offset
*
rr1
*
gl0
+
(
bn1
-
offset
*
rr3
)
*
(
gl1
+
gl6
)
+
(
bn2
-
offset
*
rr5
)
*
(
gl2
+
gl7
+
gl8
)
+
(
bn3
-
offset
*
rr7
)
*
(
gl3
+
gl5
)
+
(
bn4
-
offset
*
rr9
)
*
gl4
;
// calculate the gl functions for induced components
// intermediate variables for permanent force terms
float
gf1
=
bn1
*
gl0
+
bn2
*
(
gl1
+
gl6
)
+
bn3
*
(
gl2
+
gl7
+
gl8
)
+
bn4
*
(
gl3
+
gl5
)
+
bn5
*
gl4
;
gf1
-=
offset
*
(
rr3
*
gl0
+
rr5
*
(
gl1
+
gl6
)
+
rr7
*
(
gl2
+
gl7
+
gl8
)
+
rr9
*
(
gl3
+
gl5
)
+
rr11
*
gl4
);
ftm21
+=
gf1
*
xr
;
ftm22
+=
gf1
*
yr
;
ftm23
+=
gf1
*
zr
;
float
gf2
=
-
ck
*
bn1
+
sc4
*
bn2
-
sc6
*
bn3
-
offset
*
(
-
ck
*
rr3
+
sc4
*
rr5
-
sc6
*
rr7
);
ftm21
+=
gf2
*
di1
;
ftm22
+=
gf2
*
di2
;
ftm23
+=
gf2
*
di3
;
float
gf3
=
ci
*
bn1
+
sc3
*
bn2
+
sc5
*
bn3
-
offset
*
(
ci
*
rr3
+
sc3
*
rr5
+
sc5
*
rr7
);
ftm21
+=
gf3
*
dk1
;
ftm22
+=
gf3
*
dk2
;
ftm23
+=
gf3
*
dk3
;
float
gf5
=
2.0
f
*
(
-
ck
*
bn2
+
sc4
*
bn3
-
sc6
*
bn4
-
offset
*
(
-
ck
*
rr5
+
sc4
*
rr7
-
sc6
*
rr9
));
ftm21
+=
gf5
*
qir1
;
ftm22
+=
gf5
*
qir2
;
ftm23
+=
gf5
*
qir3
;
float
gf6
=
2.0
f
*
(
-
ci
*
bn2
-
sc3
*
bn3
-
sc5
*
bn4
-
offset
*
(
-
ci
*
rr5
-
sc3
*
rr7
-
sc5
*
rr9
));
ftm21
+=
gf6
*
qkr1
;
ftm22
+=
gf6
*
qkr2
;
ftm23
+=
gf6
*
qkr3
;
forceTorqueEnergy
->
x
=
ftm21
;
forceTorqueEnergy
->
y
=
ftm22
;
forceTorqueEnergy
->
z
=
ftm23
;
return
;
}
static
__device__
void
calculatePmeDirectElectrostaticPairIxnF2_kernel
(
const
PmeDirectElectrostaticParticle
&
atomI
,
const
PmeDirectElectrostaticParticle
&
atomJ
,
const
float4
delta
,
const
float4
bn
,
const
float
*
scalingFactors
,
float4
*
forceTorqueEnergy
){
float
xr
=
delta
.
x
;
float
yr
=
delta
.
y
;
float
zr
=
delta
.
z
;
float
r
=
delta
.
w
;
// set the permanent multipole and induced dipole values;
float
ci
=
atomI
.
q
;
float
di1
=
atomI
.
labFrameDipole
[
0
];
float
di2
=
atomI
.
labFrameDipole
[
1
];
float
di3
=
atomI
.
labFrameDipole
[
2
];
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
ck
=
atomJ
.
q
;
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
float
dk3
=
atomJ
.
labFrameDipole
[
2
];
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
5
];
float
bn1
=
bn
.
x
;
float
bn2
=
bn
.
y
;
float
bn3
=
bn
.
z
;
float
bn4
=
bn
.
w
;
// apply Thole polarization damping to scale factors
float
rr1
=
1.0
f
/
r
;
float
rr2
=
rr1
*
rr1
;
float
rr3
=
rr1
*
rr2
;
float
rr5
=
3.0
f
*
rr3
*
rr2
;
float
rr7
=
5.0
f
*
rr5
*
rr2
;
float
rr9
=
7.0
f
*
rr7
*
rr2
;
float
scale3
=
1.0
f
;
float
scale5
=
1.0
f
;
float
scale7
=
1.0
f
;
float
damp
=
atomI
.
damp
*
atomJ
.
damp
;
if
(
damp
!=
0.0
f
){
float
pgamma
=
atomI
.
thole
<
atomJ
.
thole
?
atomI
.
thole
:
atomJ
.
thole
;
float
ratio
=
r
/
damp
;
damp
=
-
pgamma
*
ratio
*
ratio
*
ratio
;
if
(
damp
>
-
50.0
f
){
float
expdamp
=
exp
(
damp
);
scale3
=
1.0
f
-
expdamp
;
scale5
=
1.0
f
-
(
1.0
f
-
damp
)
*
expdamp
;
scale7
=
1.0
f
-
(
1.0
f
-
damp
+
0.6
f
*
damp
*
damp
)
*
expdamp
;
}
else
{
damp
=
0.0
f
;
}
}
float
psc5
=
1.0
f
-
scale5
*
scalingFactors
[
PScaleIndex
];
float
dsc5
=
1.0
f
-
scale5
*
scalingFactors
[
DScaleIndex
];
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi2
*
atomJ
.
inducedDipole
[
1
]
+
qi3
*
atomJ
.
inducedDipole
[
2
];
float
qkui1
=
qk1
*
atomI
.
inducedDipole
[
0
]
+
qk2
*
atomI
.
inducedDipole
[
1
]
+
qk3
*
atomI
.
inducedDipole
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi2
*
atomJ
.
inducedDipoleP
[
1
]
+
qi3
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip1
=
qk1
*
atomI
.
inducedDipoleP
[
0
]
+
qk2
*
atomI
.
inducedDipoleP
[
1
]
+
qk3
*
atomI
.
inducedDipoleP
[
2
];
forceTorqueEnergy
->
x
+=
bn2
*
(
qkui1
+
qkuip1
-
qiuk1
-
qiukp1
);
forceTorqueEnergy
->
x
-=
rr5
*
((
qkui1
-
qiuk1
)
*
psc5
+
(
qkuip1
-
qiukp1
)
*
dsc5
);
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi6
*
atomJ
.
inducedDipole
[
2
];
float
qkui2
=
qk2
*
atomI
.
inducedDipole
[
0
]
+
qk5
*
atomI
.
inducedDipole
[
1
]
+
qk6
*
atomI
.
inducedDipole
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi6
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip2
=
qk2
*
atomI
.
inducedDipoleP
[
0
]
+
qk5
*
atomI
.
inducedDipoleP
[
1
]
+
qk6
*
atomI
.
inducedDipoleP
[
2
];
forceTorqueEnergy
->
y
+=
bn2
*
(
qkui2
+
qkuip2
-
qiuk2
-
qiukp2
);
forceTorqueEnergy
->
y
-=
rr5
*
((
qkui2
-
qiuk2
)
*
psc5
+
(
qkuip2
-
qiukp2
)
*
dsc5
);
float
qiuk3
=
qi3
*
atomJ
.
inducedDipole
[
0
]
+
qi6
*
atomJ
.
inducedDipole
[
1
]
+
qi9
*
atomJ
.
inducedDipole
[
2
];
float
qkui3
=
qk3
*
atomI
.
inducedDipole
[
0
]
+
qk6
*
atomI
.
inducedDipole
[
1
]
+
qk9
*
atomI
.
inducedDipole
[
2
];
float
qiukp3
=
qi3
*
atomJ
.
inducedDipoleP
[
0
]
+
qi6
*
atomJ
.
inducedDipoleP
[
1
]
+
qi9
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip3
=
qk3
*
atomI
.
inducedDipoleP
[
0
]
+
qk6
*
atomI
.
inducedDipoleP
[
1
]
+
qk9
*
atomI
.
inducedDipoleP
[
2
];
forceTorqueEnergy
->
z
+=
bn2
*
(
qkui3
+
qkuip3
-
qiuk3
-
qiukp3
);
forceTorqueEnergy
->
z
-=
rr5
*
((
qkui3
-
qiuk3
)
*
psc5
+
(
qkuip3
-
qiukp3
)
*
dsc5
);
float
sc3
=
di1
*
xr
+
di2
*
yr
+
di3
*
zr
;
float
sc4
=
dk1
*
xr
+
dk2
*
yr
+
dk3
*
zr
;
float
qir1
=
qi1
*
xr
+
qi2
*
yr
+
qi3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
sc5
=
qir1
*
xr
+
qir2
*
yr
+
qir3
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk2
*
yr
+
qk3
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk6
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
sc6
=
qkr1
*
xr
+
qkr2
*
yr
+
qkr3
*
zr
;
// calculate the scalar products for induced components
float
sci3
=
atomI
.
inducedDipole
[
0
]
*
xr
+
atomI
.
inducedDipole
[
1
]
*
yr
+
atomI
.
inducedDipole
[
2
]
*
zr
;
float
scip3
=
atomI
.
inducedDipoleP
[
0
]
*
xr
+
atomI
.
inducedDipoleP
[
1
]
*
yr
+
atomI
.
inducedDipoleP
[
2
]
*
zr
;
float
sci4
=
atomJ
.
inducedDipole
[
0
]
*
xr
+
atomJ
.
inducedDipole
[
1
]
*
yr
+
atomJ
.
inducedDipole
[
2
]
*
zr
;
float
scip4
=
atomJ
.
inducedDipoleP
[
0
]
*
xr
+
atomJ
.
inducedDipoleP
[
1
]
*
yr
+
atomJ
.
inducedDipoleP
[
2
]
*
zr
;
float
usc5
=
1.0
f
-
scale5
*
scalingFactors
[
UScaleIndex
];
float
prefactor1
=
0.5
f
*
(
bn2
-
rr5
*
usc5
);
forceTorqueEnergy
->
x
+=
prefactor1
*
(
(
sci4
*
atomI
.
inducedDipoleP
[
0
]
+
scip4
*
atomI
.
inducedDipole
[
0
])
+
sci3
*
atomJ
.
inducedDipoleP
[
0
]
+
scip3
*
atomJ
.
inducedDipole
[
0
]
);
forceTorqueEnergy
->
y
+=
prefactor1
*
(
(
sci4
*
atomI
.
inducedDipoleP
[
1
]
+
scip4
*
atomI
.
inducedDipole
[
1
])
+
sci3
*
atomJ
.
inducedDipoleP
[
1
]
+
scip3
*
atomJ
.
inducedDipole
[
1
]
);
forceTorqueEnergy
->
z
+=
prefactor1
*
(
(
sci4
*
atomI
.
inducedDipoleP
[
2
]
+
scip4
*
atomI
.
inducedDipole
[
2
])
+
sci3
*
atomJ
.
inducedDipoleP
[
2
]
+
scip3
*
atomJ
.
inducedDipole
[
2
]
);
float
dsc7
=
1.0
f
-
scale7
*
scalingFactors
[
DScaleIndex
];
float
psc7
=
1.0
f
-
scale7
*
scalingFactors
[
PScaleIndex
];
float
gfi5
=
bn3
*
(
sci4
+
scip4
)
-
rr7
*
(
sci4
*
psc7
+
scip4
*
dsc7
);
forceTorqueEnergy
->
x
+=
gfi5
*
qir1
;
forceTorqueEnergy
->
y
+=
gfi5
*
qir2
;
forceTorqueEnergy
->
z
+=
gfi5
*
qir3
;
prefactor1
=
0.5
f
*
(
bn2
*
(
sci4
+
scip4
)
-
rr5
*
(
sci4
*
psc5
+
scip4
*
dsc5
)
);
float
prefactor2
=
0.5
f
*
(
bn2
*
(
sci3
+
scip3
)
-
rr5
*
(
sci3
*
psc5
+
scip3
*
dsc5
)
);
forceTorqueEnergy
->
x
+=
prefactor1
*
di1
+
prefactor2
*
dk1
;
forceTorqueEnergy
->
y
+=
prefactor1
*
di2
+
prefactor2
*
dk2
;
forceTorqueEnergy
->
z
+=
prefactor1
*
di3
+
prefactor2
*
dk3
;
float
gfi6
=
-
bn3
*
(
sci3
+
scip3
)
+
rr7
*
(
sci3
*
psc7
+
scip3
*
dsc7
);
forceTorqueEnergy
->
x
+=
gfi6
*
qkr1
;
forceTorqueEnergy
->
y
+=
gfi6
*
qkr2
;
forceTorqueEnergy
->
z
+=
gfi6
*
qkr3
;
float
sci1
=
atomI
.
inducedDipole
[
0
]
*
dk1
+
atomI
.
inducedDipole
[
1
]
*
dk2
+
atomI
.
inducedDipole
[
2
]
*
dk3
+
di1
*
atomJ
.
inducedDipole
[
0
]
+
di2
*
atomJ
.
inducedDipole
[
1
]
+
di3
*
atomJ
.
inducedDipole
[
2
];
float
sci7
=
qir1
*
atomJ
.
inducedDipole
[
0
]
+
qir2
*
atomJ
.
inducedDipole
[
1
]
+
qir3
*
atomJ
.
inducedDipole
[
2
];
float
sci8
=
qkr1
*
atomI
.
inducedDipole
[
0
]
+
qkr2
*
atomI
.
inducedDipole
[
1
]
+
qkr3
*
atomI
.
inducedDipole
[
2
];
float
scip1
=
atomI
.
inducedDipoleP
[
0
]
*
dk1
+
atomI
.
inducedDipoleP
[
1
]
*
dk2
+
atomI
.
inducedDipoleP
[
2
]
*
dk3
+
di1
*
atomJ
.
inducedDipoleP
[
0
]
+
di2
*
atomJ
.
inducedDipoleP
[
1
]
+
di3
*
atomJ
.
inducedDipoleP
[
2
];
float
scip2
=
atomI
.
inducedDipole
[
0
]
*
atomJ
.
inducedDipoleP
[
0
]
+
atomI
.
inducedDipole
[
1
]
*
atomJ
.
inducedDipoleP
[
1
]
+
atomI
.
inducedDipole
[
2
]
*
atomJ
.
inducedDipoleP
[
2
]
+
atomJ
.
inducedDipole
[
0
]
*
atomI
.
inducedDipoleP
[
0
]
+
atomJ
.
inducedDipole
[
1
]
*
atomI
.
inducedDipoleP
[
1
]
+
atomJ
.
inducedDipole
[
2
]
*
atomI
.
inducedDipoleP
[
2
];
float
scip7
=
qir1
*
atomJ
.
inducedDipoleP
[
0
]
+
qir2
*
atomJ
.
inducedDipoleP
[
1
]
+
qir3
*
atomJ
.
inducedDipoleP
[
2
];
float
scip8
=
qkr1
*
atomI
.
inducedDipoleP
[
0
]
+
qkr2
*
atomI
.
inducedDipoleP
[
1
]
+
qkr3
*
atomI
.
inducedDipoleP
[
2
];
// calculate the gl functions for permanent components
// calculate the gl functions for induced components
float
gli1
=
ck
*
sci3
-
ci
*
sci4
;
float
gli2
=
-
sc3
*
sci4
-
sci3
*
sc4
;
float
gli3
=
sci3
*
sc6
-
sci4
*
sc5
;
float
gli6
=
sci1
;
float
gli7
=
2.0
f
*
(
sci7
-
sci8
);
float
glip1
=
ck
*
scip3
-
ci
*
scip4
;
float
glip2
=
-
sc3
*
scip4
-
scip3
*
sc4
;
float
glip3
=
scip3
*
sc6
-
scip4
*
sc5
;
float
glip6
=
scip1
;
float
glip7
=
2.0
f
*
(
scip7
-
scip8
);
float
ei
=
(
bn1
*
(
gli1
+
gli6
)
+
bn2
*
(
gli2
+
gli7
)
+
bn3
*
gli3
);
float
psc3
=
1.0
f
-
scale3
*
scalingFactors
[
PScaleIndex
];
ei
-=
(
rr3
*
(
gli1
+
gli6
)
*
psc3
+
rr5
*
(
gli2
+
gli7
)
*
psc5
+
rr7
*
gli3
*
psc7
);
forceTorqueEnergy
->
w
+=
0.5
f
*
ei
;
float
dsc3
=
1.0
f
-
scale3
*
scalingFactors
[
DScaleIndex
];
float
usc3
=
1.0
f
-
scale3
*
scalingFactors
[
UScaleIndex
];
float
gfi1
=
(
bn2
*
(
gli1
+
glip1
+
gli6
+
glip6
)
+
bn2
*
scip2
+
bn3
*
(
gli2
+
glip2
+
gli7
+
glip7
)
-
bn3
*
(
sci3
*
scip4
+
scip3
*
sci4
)
+
bn4
*
(
gli3
+
glip3
));
gfi1
-=
rr5
*
((
gli1
+
gli6
)
*
psc3
+
(
glip1
+
glip6
)
*
dsc3
+
scip2
*
usc3
)
+
rr7
*
((
gli7
+
gli2
)
*
psc5
+
(
glip7
+
glip2
)
*
dsc5
-
(
sci3
*
scip4
+
scip3
*
sci4
)
*
usc5
)
+
rr9
*
(
gli3
*
psc7
+
glip3
*
dsc7
);
gfi1
*=
0.5
f
;
forceTorqueEnergy
->
x
+=
gfi1
*
xr
;
forceTorqueEnergy
->
y
+=
gfi1
*
yr
;
forceTorqueEnergy
->
z
+=
gfi1
*
zr
;
float
gfi2
=
-
ck
*
bn1
+
sc4
*
bn2
-
sc6
*
bn3
;
float
gfi3
=
ci
*
bn1
+
sc3
*
bn2
+
sc5
*
bn3
;
float
ftm2i1
=
gfi2
*
(
atomI
.
inducedDipole
[
0
]
+
atomI
.
inducedDipoleP
[
0
])
+
gfi3
*
(
atomJ
.
inducedDipole
[
0
]
+
atomJ
.
inducedDipoleP
[
0
]);
float
ftm2i2
=
gfi2
*
(
atomI
.
inducedDipole
[
1
]
+
atomI
.
inducedDipoleP
[
1
])
+
gfi3
*
(
atomJ
.
inducedDipole
[
1
]
+
atomJ
.
inducedDipoleP
[
1
]);
float
ftm2i3
=
gfi2
*
(
atomI
.
inducedDipole
[
2
]
+
atomI
.
inducedDipoleP
[
2
])
+
gfi3
*
(
atomJ
.
inducedDipole
[
2
]
+
atomJ
.
inducedDipoleP
[
2
]);
forceTorqueEnergy
->
x
-=
0.5
f
*
(
-
rr3
*
ck
*
(
atomI
.
inducedDipole
[
0
]
*
psc3
+
atomI
.
inducedDipoleP
[
0
]
*
dsc3
)
+
rr5
*
sc4
*
(
atomI
.
inducedDipole
[
0
]
*
psc5
+
atomI
.
inducedDipoleP
[
0
]
*
dsc5
)
-
rr7
*
sc6
*
(
atomI
.
inducedDipole
[
0
]
*
psc7
+
atomI
.
inducedDipoleP
[
0
]
*
dsc7
)
+
rr3
*
ci
*
(
atomJ
.
inducedDipole
[
0
]
*
psc3
+
atomJ
.
inducedDipoleP
[
0
]
*
dsc3
)
+
rr5
*
sc3
*
(
atomJ
.
inducedDipole
[
0
]
*
psc5
+
atomJ
.
inducedDipoleP
[
0
]
*
dsc5
)
+
rr7
*
sc5
*
(
atomJ
.
inducedDipole
[
0
]
*
psc7
+
atomJ
.
inducedDipoleP
[
0
]
*
dsc7
));
forceTorqueEnergy
->
y
-=
0.5
f
*
(
-
rr3
*
ck
*
(
atomI
.
inducedDipole
[
1
]
*
psc3
+
atomI
.
inducedDipoleP
[
1
]
*
dsc3
)
+
rr5
*
sc4
*
(
atomI
.
inducedDipole
[
1
]
*
psc5
+
atomI
.
inducedDipoleP
[
1
]
*
dsc5
)
-
rr7
*
sc6
*
(
atomI
.
inducedDipole
[
1
]
*
psc7
+
atomI
.
inducedDipoleP
[
1
]
*
dsc7
)
+
rr3
*
ci
*
(
atomJ
.
inducedDipole
[
1
]
*
psc3
+
atomJ
.
inducedDipoleP
[
1
]
*
dsc3
)
+
rr5
*
sc3
*
(
atomJ
.
inducedDipole
[
1
]
*
psc5
+
atomJ
.
inducedDipoleP
[
1
]
*
dsc5
)
+
rr7
*
sc5
*
(
atomJ
.
inducedDipole
[
1
]
*
psc7
+
atomJ
.
inducedDipoleP
[
1
]
*
dsc7
));
forceTorqueEnergy
->
z
-=
0.5
f
*
(
-
rr3
*
ck
*
(
atomI
.
inducedDipole
[
2
]
*
psc3
+
atomI
.
inducedDipoleP
[
2
]
*
dsc3
)
+
rr5
*
sc4
*
(
atomI
.
inducedDipole
[
2
]
*
psc5
+
atomI
.
inducedDipoleP
[
2
]
*
dsc5
)
-
rr7
*
sc6
*
(
atomI
.
inducedDipole
[
2
]
*
psc7
+
atomI
.
inducedDipoleP
[
2
]
*
dsc7
)
+
rr3
*
ci
*
(
atomJ
.
inducedDipole
[
2
]
*
psc3
+
atomJ
.
inducedDipoleP
[
2
]
*
dsc3
)
+
rr5
*
sc3
*
(
atomJ
.
inducedDipole
[
2
]
*
psc5
+
atomJ
.
inducedDipoleP
[
2
]
*
dsc5
)
+
rr7
*
sc5
*
(
atomJ
.
inducedDipole
[
2
]
*
psc7
+
atomJ
.
inducedDipoleP
[
2
]
*
dsc7
));
if
(
damp
!=
0.0
f
){
float
expdamp
=
exp
(
damp
);
float
temp3
=
-
3.0
f
*
damp
*
expdamp
*
rr2
;
float
temp5
=
-
damp
;
float
temp7
=
-
0.2
f
-
0.6
f
*
damp
;
float
ddsc31
=
temp3
*
xr
;
float
ddsc32
=
temp3
*
yr
;
float
ddsc33
=
temp3
*
zr
;
float
ddsc51
=
temp5
*
ddsc31
;
float
ddsc52
=
temp5
*
ddsc32
;
float
ddsc53
=
temp5
*
ddsc33
;
float
ddsc71
=
temp7
*
ddsc51
;
float
ddsc72
=
temp7
*
ddsc52
;
float
ddsc73
=
temp7
*
ddsc53
;
temp3
=
rr3
*
((
gli1
+
gli6
)
*
scalingFactors
[
PScaleIndex
]
+
(
glip1
+
glip6
)
*
scalingFactors
[
DScaleIndex
]);
temp5
=
rr5
*
((
gli2
+
gli7
)
*
scalingFactors
[
PScaleIndex
]
+
(
glip2
+
glip7
)
*
scalingFactors
[
DScaleIndex
]);
temp7
=
rr7
*
(
gli3
*
scalingFactors
[
PScaleIndex
]
+
glip3
*
scalingFactors
[
DScaleIndex
]);
ftm2i1
-=
(
temp3
*
ddsc31
+
temp5
*
ddsc51
+
temp7
*
ddsc71
);
ftm2i2
-=
(
temp3
*
ddsc32
+
temp5
*
ddsc52
+
temp7
*
ddsc72
);
ftm2i3
-=
(
temp3
*
ddsc33
+
temp5
*
ddsc53
+
temp7
*
ddsc73
);
if
(
cAmoebaSim
.
polarizationType
==
0
){
temp3
=
rr3
*
scalingFactors
[
UScaleIndex
]
*
scip2
;
temp5
=
-
rr5
*
scalingFactors
[
UScaleIndex
]
*
(
sci3
*
scip4
+
scip3
*
sci4
);
ftm2i1
-=
(
temp3
*
ddsc31
+
temp5
*
ddsc51
);
ftm2i2
-=
(
temp3
*
ddsc32
+
temp5
*
ddsc52
);
ftm2i3
-=
(
temp3
*
ddsc33
+
temp5
*
ddsc53
);
}
}
if
(
cAmoebaSim
.
polarizationType
){
float
gfd
=
(
bn2
*
scip2
-
bn3
*
(
scip3
*
sci4
+
sci3
*
scip4
))
-
(
rr5
*
scip2
*
usc3
-
rr7
*
(
scip3
*
sci4
+
sci3
*
scip4
)
*
usc5
);
float
p1
=
(
bn2
-
usc5
*
rr5
);
ftm2i1
-=
gfd
*
xr
+
p1
*
(
sci4
*
atomI
.
inducedDipoleP
[
0
]
+
scip4
*
atomI
.
inducedDipole
[
0
]
+
sci3
*
atomJ
.
inducedDipoleP
[
0
]
+
scip3
*
atomJ
.
inducedDipole
[
0
]);
ftm2i2
-=
gfd
*
yr
+
p1
*
(
sci4
*
atomI
.
inducedDipoleP
[
1
]
+
scip4
*
atomI
.
inducedDipole
[
1
]
+
sci3
*
atomJ
.
inducedDipoleP
[
1
]
+
scip3
*
atomJ
.
inducedDipole
[
1
]);
ftm2i3
-=
gfd
*
zr
+
p1
*
(
sci4
*
atomI
.
inducedDipoleP
[
2
]
+
scip4
*
atomI
.
inducedDipole
[
2
]
+
sci3
*
atomJ
.
inducedDipoleP
[
2
]
+
scip3
*
atomJ
.
inducedDipole
[
2
]);
}
forceTorqueEnergy
->
x
+=
0.5
f
*
ftm2i1
;
forceTorqueEnergy
->
y
+=
0.5
f
*
ftm2i2
;
forceTorqueEnergy
->
z
+=
0.5
f
*
ftm2i3
;
return
;
}
static
__device__
void
calculatePmeDirectElectrostaticPairIxnT1_kernel
(
const
PmeDirectElectrostaticParticle
&
atomI
,
const
PmeDirectElectrostaticParticle
&
atomJ
,
const
float4
delta
,
const
float4
bn
,
const
float
*
scalingFactors
,
float4
*
forceTorqueEnergy
){
float
xr
=
delta
.
x
;
float
yr
=
delta
.
y
;
float
zr
=
delta
.
z
;
float
r
=
delta
.
w
;
// set the permanent multipole and induced dipole values;
float
di1
=
atomI
.
labFrameDipole
[
0
];
float
di2
=
atomI
.
labFrameDipole
[
1
];
float
di3
=
atomI
.
labFrameDipole
[
2
];
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
ck
=
atomJ
.
q
;
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
float
dk3
=
atomJ
.
labFrameDipole
[
2
];
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
5
];
float
bn1
=
bn
.
x
;
float
bn2
=
bn
.
y
;
float
bn3
=
bn
.
z
;
float
bn4
=
bn
.
w
;
// apply Thole polarization damping to scale factors
float
rr1
=
1.0
f
/
r
;
float
rr2
=
rr1
*
rr1
;
float
rr3
=
rr1
*
rr2
;
float
rr5
=
3.0
f
*
rr3
*
rr2
;
float
rr7
=
5.0
f
*
rr5
*
rr2
;
float
rr9
=
7.0
f
*
rr7
*
rr2
;
float
scale
=
1.0
f
-
scalingFactors
[
MScaleIndex
];
float
prefactor
=
scale
*
rr3
-
bn1
;
float
dixdk1
=
di2
*
dk3
-
di3
*
dk2
;
float
ttm21
=
prefactor
*
dixdk1
;
float
dixdk2
=
di3
*
dk1
-
di1
*
dk3
;
float
ttm22
=
prefactor
*
dixdk2
;
float
dixdk3
=
di1
*
dk2
-
di2
*
dk1
;
float
ttm23
=
prefactor
*
dixdk3
;
float
qir1
=
qi1
*
xr
+
qi2
*
yr
+
qi3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk2
*
yr
+
qk3
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk6
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
qiqkr1
=
qi1
*
qkr1
+
qi2
*
qkr2
+
qi3
*
qkr3
;
float
qiqkr2
=
qi2
*
qkr1
+
qi5
*
qkr2
+
qi6
*
qkr3
;
float
qiqkr3
=
qi3
*
qkr1
+
qi6
*
qkr2
+
qi9
*
qkr3
;
float
rxqikr1
=
yr
*
qiqkr3
-
zr
*
qiqkr2
;
float
qkrxqir1
=
qkr2
*
qir3
-
qkr3
*
qir2
;
prefactor
=
4.0
f
*
(
bn3
-
scale
*
rr7
);
ttm21
-=
prefactor
*
(
rxqikr1
+
qkrxqir1
);
float
rxqikr2
=
zr
*
qiqkr1
-
xr
*
qiqkr3
;
float
qkrxqir2
=
qkr3
*
qir1
-
qkr1
*
qir3
;
ttm22
-=
prefactor
*
(
rxqikr2
+
qkrxqir2
);
float
rxqikr3
=
xr
*
qiqkr2
-
yr
*
qiqkr1
;
float
qkrxqir3
=
qkr1
*
qir2
-
qkr2
*
qir1
;
ttm23
-=
prefactor
*
(
rxqikr3
+
qkrxqir3
);
float
qidk1
=
qi1
*
dk1
+
qi2
*
dk2
+
qi3
*
dk3
;
float
qidk2
=
qi2
*
dk1
+
qi5
*
dk2
+
qi6
*
dk3
;
float
qidk3
=
qi3
*
dk1
+
qi6
*
dk2
+
qi9
*
dk3
;
float
dixqkr1
=
di2
*
qkr3
-
di3
*
qkr2
;
float
dkxqir1
=
dk2
*
qir3
-
dk3
*
qir2
;
float
rxqidk1
=
yr
*
qidk3
-
zr
*
qidk2
;
float
qixqk1
=
qi2
*
qk3
+
qi5
*
qk6
+
qi6
*
qk9
-
qi3
*
qk2
-
qi6
*
qk5
-
qi9
*
qk6
;
prefactor
=
2.0
f
*
(
bn2
-
scale
*
rr5
);
ttm21
+=
prefactor
*
(
dixqkr1
+
dkxqir1
+
rxqidk1
-
2.0
f
*
qixqk1
);
float
dixqkr2
=
di3
*
qkr1
-
di1
*
qkr3
;
float
dkxqir2
=
dk3
*
qir1
-
dk1
*
qir3
;
float
rxqidk2
=
zr
*
qidk1
-
xr
*
qidk3
;
float
qixqk2
=
qi3
*
qk1
+
qi6
*
qk2
+
qi9
*
qk3
-
qi1
*
qk3
-
qi2
*
qk6
-
qi3
*
qk9
;
ttm22
+=
prefactor
*
(
dixqkr2
+
dkxqir2
+
rxqidk2
-
2.0
f
*
qixqk2
);
float
dixqkr3
=
di1
*
qkr2
-
di2
*
qkr1
;
float
dkxqir3
=
dk1
*
qir2
-
dk2
*
qir1
;
float
rxqidk3
=
xr
*
qidk2
-
yr
*
qidk1
;
float
qixqk3
=
qi1
*
qk2
+
qi2
*
qk5
+
qi3
*
qk6
-
qi2
*
qk1
-
qi5
*
qk2
-
qi6
*
qk3
;
ttm23
+=
prefactor
*
(
dixqkr3
+
dkxqir3
+
rxqidk3
-
2.0
f
*
qixqk3
);
float
sc4
=
dk1
*
xr
+
dk2
*
yr
+
dk3
*
zr
;
float
sc6
=
qkr1
*
xr
+
qkr2
*
yr
+
qkr3
*
zr
;
float
gf2
=
-
ck
*
bn1
+
sc4
*
bn2
-
sc6
*
bn3
;
float
gfr2
=
-
ck
*
rr3
+
sc4
*
rr5
-
sc6
*
rr7
;
prefactor
=
(
gf2
-
scale
*
gfr2
);
ttm21
+=
prefactor
*
(
di2
*
zr
-
di3
*
yr
);
ttm22
+=
prefactor
*
(
di3
*
xr
-
di1
*
zr
);
ttm23
+=
prefactor
*
(
di1
*
yr
-
di2
*
xr
);
float
gf5
=
(
-
ck
*
bn2
+
sc4
*
bn3
-
sc6
*
bn4
);
float
gfr5
=
(
-
ck
*
rr5
+
sc4
*
rr7
-
sc6
*
rr9
);
prefactor
=
2.0
f
*
(
gf5
-
scale
*
gfr5
);
float
rxqir1
=
yr
*
qir3
-
zr
*
qir2
;
float
rxqir2
=
zr
*
qir1
-
xr
*
qir3
;
float
rxqir3
=
xr
*
qir2
-
yr
*
qir1
;
ttm21
-=
prefactor
*
rxqir1
;
ttm22
-=
prefactor
*
rxqir2
;
ttm23
-=
prefactor
*
rxqir3
;
forceTorqueEnergy
->
x
=
ttm21
;
forceTorqueEnergy
->
y
=
ttm22
;
forceTorqueEnergy
->
z
=
ttm23
;
return
;
}
static
__device__
void
calculatePmeDirectElectrostaticPairIxnT2_kernel
(
const
PmeDirectElectrostaticParticle
&
atomI
,
const
PmeDirectElectrostaticParticle
&
atomJ
,
const
float4
delta
,
const
float4
bn
,
const
float
*
scalingFactors
,
float4
*
forceTorqueEnergy
){
float
xr
=
delta
.
x
;
float
yr
=
delta
.
y
;
float
zr
=
delta
.
z
;
float
r
=
delta
.
w
;
// set the permanent multipole and induced dipole values;
float
di1
=
atomI
.
labFrameDipole
[
0
];
float
di2
=
atomI
.
labFrameDipole
[
1
];
float
di3
=
atomI
.
labFrameDipole
[
2
];
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
bn1
=
bn
.
x
;
float
bn2
=
bn
.
y
;
float
bn3
=
bn
.
z
;
// apply Thole polarization damping to scale factors
float
rr1
=
1.0
f
/
r
;
float
rr2
=
rr1
*
rr1
;
float
rr3
=
rr1
*
rr2
;
float
rr5
=
3.0
f
*
rr3
*
rr2
;
float
rr7
=
5.0
f
*
rr5
*
rr2
;
float
scale3
=
1.0
f
;
float
scale5
=
1.0
f
;
float
scale7
=
1.0
f
;
float
damp
=
atomI
.
damp
*
atomJ
.
damp
;
if
(
damp
!=
0.0
f
){
float
pgamma
=
atomI
.
thole
<
atomJ
.
thole
?
atomI
.
thole
:
atomJ
.
thole
;
float
ratio
=
r
/
damp
;
damp
=
-
pgamma
*
ratio
*
ratio
*
ratio
;
if
(
damp
>
-
50.0
f
){
float
expdamp
=
exp
(
damp
);
scale3
=
1.0
f
-
expdamp
;
scale5
=
1.0
f
-
(
1.0
f
-
damp
)
*
expdamp
;
scale7
=
1.0
f
-
(
1.0
f
-
damp
+
0.6
f
*
damp
*
damp
)
*
expdamp
;
}
}
float
dsc3
=
1.0
f
-
scale3
*
scalingFactors
[
DScaleIndex
];
float
dsc5
=
1.0
f
-
scale5
*
scalingFactors
[
DScaleIndex
];
float
dsc7
=
1.0
f
-
scale7
*
scalingFactors
[
DScaleIndex
];
float
psc3
=
1.0
f
-
scale3
*
scalingFactors
[
PScaleIndex
];
float
psc5
=
1.0
f
-
scale5
*
scalingFactors
[
PScaleIndex
];
float
psc7
=
1.0
f
-
scale7
*
scalingFactors
[
PScaleIndex
];
float
prefactor1
=
0.5
f
*
(
psc3
*
rr3
-
bn1
);
float
prefactor2
=
0.5
f
*
(
dsc3
*
rr3
-
bn1
);
float
dixuk1
=
di2
*
atomJ
.
inducedDipole
[
2
]
-
di3
*
atomJ
.
inducedDipole
[
1
];
float
dixukp1
=
di2
*
atomJ
.
inducedDipoleP
[
2
]
-
di3
*
atomJ
.
inducedDipoleP
[
1
];
float
ttm2i1
=
prefactor1
*
dixuk1
+
prefactor2
*
dixukp1
;
float
dixuk2
=
di3
*
atomJ
.
inducedDipole
[
0
]
-
di1
*
atomJ
.
inducedDipole
[
2
];
float
dixukp2
=
di3
*
atomJ
.
inducedDipoleP
[
0
]
-
di1
*
atomJ
.
inducedDipoleP
[
2
];
float
ttm2i2
=
prefactor1
*
dixuk2
+
prefactor2
*
dixukp2
;
float
dixuk3
=
di1
*
atomJ
.
inducedDipole
[
1
]
-
di2
*
atomJ
.
inducedDipole
[
0
];
float
dixukp3
=
di1
*
atomJ
.
inducedDipoleP
[
1
]
-
di2
*
atomJ
.
inducedDipoleP
[
0
];
float
ttm2i3
=
prefactor1
*
dixuk3
+
prefactor2
*
dixukp3
;
float
sci4
=
atomJ
.
inducedDipole
[
0
]
*
xr
+
atomJ
.
inducedDipole
[
1
]
*
yr
+
atomJ
.
inducedDipole
[
2
]
*
zr
;
float
scip4
=
atomJ
.
inducedDipoleP
[
0
]
*
xr
+
atomJ
.
inducedDipoleP
[
1
]
*
yr
+
atomJ
.
inducedDipoleP
[
2
]
*
zr
;
float
gti2
=
bn2
*
(
sci4
+
scip4
);
float
gtri2
=
rr5
*
(
sci4
*
psc5
+
scip4
*
dsc5
);
prefactor1
=
0.5
f
*
(
gti2
-
gtri2
);
ttm2i1
+=
prefactor1
*
(
di2
*
zr
-
di3
*
yr
);
ttm2i2
+=
prefactor1
*
(
di3
*
xr
-
di1
*
zr
);
ttm2i3
+=
prefactor1
*
(
di1
*
yr
-
di2
*
xr
);
float
qir1
=
qi1
*
xr
+
qi2
*
yr
+
qi3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
prefactor1
=
rr7
*
(
sci4
*
psc7
+
scip4
*
dsc7
)
-
bn3
*
(
sci4
+
scip4
);
ttm2i1
+=
prefactor1
*
(
yr
*
qir3
-
zr
*
qir2
);
ttm2i2
+=
prefactor1
*
(
zr
*
qir1
-
xr
*
qir3
);
ttm2i3
+=
prefactor1
*
(
xr
*
qir2
-
yr
*
qir1
);
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi2
*
atomJ
.
inducedDipole
[
1
]
+
qi3
*
atomJ
.
inducedDipole
[
2
];
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi6
*
atomJ
.
inducedDipole
[
2
];
float
qiuk3
=
qi3
*
atomJ
.
inducedDipole
[
0
]
+
qi6
*
atomJ
.
inducedDipole
[
1
]
+
qi9
*
atomJ
.
inducedDipole
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi2
*
atomJ
.
inducedDipoleP
[
1
]
+
qi3
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi6
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp3
=
qi3
*
atomJ
.
inducedDipoleP
[
0
]
+
qi6
*
atomJ
.
inducedDipoleP
[
1
]
+
qi9
*
atomJ
.
inducedDipoleP
[
2
];
prefactor1
=
(
bn2
-
rr5
*
psc5
);
prefactor2
=
(
bn2
-
rr5
*
dsc5
);
float
ukxqir1
=
atomJ
.
inducedDipole
[
1
]
*
qir3
-
atomJ
.
inducedDipole
[
2
]
*
qir2
;
float
ukxqirp1
=
atomJ
.
inducedDipoleP
[
1
]
*
qir3
-
atomJ
.
inducedDipoleP
[
2
]
*
qir2
;
float
rxqiuk1
=
yr
*
qiuk3
-
zr
*
qiuk2
;
float
rxqiukp1
=
yr
*
qiukp3
-
zr
*
qiukp2
;
ttm2i1
+=
prefactor1
*
(
ukxqir1
+
rxqiuk1
)
+
prefactor2
*
(
ukxqirp1
+
rxqiukp1
);
float
ukxqir2
=
atomJ
.
inducedDipole
[
2
]
*
qir1
-
atomJ
.
inducedDipole
[
0
]
*
qir3
;
float
ukxqirp2
=
atomJ
.
inducedDipoleP
[
2
]
*
qir1
-
atomJ
.
inducedDipoleP
[
0
]
*
qir3
;
float
rxqiuk2
=
zr
*
qiuk1
-
xr
*
qiuk3
;
float
rxqiukp2
=
zr
*
qiukp1
-
xr
*
qiukp3
;
ttm2i2
+=
prefactor1
*
(
ukxqir2
+
rxqiuk2
)
+
prefactor2
*
(
ukxqirp2
+
rxqiukp2
);
float
ukxqir3
=
atomJ
.
inducedDipole
[
0
]
*
qir2
-
atomJ
.
inducedDipole
[
1
]
*
qir1
;
float
ukxqirp3
=
atomJ
.
inducedDipoleP
[
0
]
*
qir2
-
atomJ
.
inducedDipoleP
[
1
]
*
qir1
;
float
rxqiuk3
=
xr
*
qiuk2
-
yr
*
qiuk1
;
float
rxqiukp3
=
xr
*
qiukp2
-
yr
*
qiukp1
;
ttm2i3
+=
prefactor1
*
(
ukxqir3
+
rxqiuk3
)
+
prefactor2
*
(
ukxqirp3
+
rxqiukp3
);
forceTorqueEnergy
->
x
+=
ttm2i1
;
forceTorqueEnergy
->
y
+=
ttm2i2
;
forceTorqueEnergy
->
z
+=
ttm2i3
;
return
;
}
__device__
void
calculatePmeDirectElectrostaticPairIxn_kernel
(
const
PmeDirectElectrostaticParticle
&
atomI
,
const
PmeDirectElectrostaticParticle
&
atomJ
,
const
float
*
scalingFactors
,
float4
forceTorqueEnergy
[
3
]
){
float4
delta
;
delta
.
x
=
atomJ
.
x
-
atomI
.
x
;
delta
.
y
=
atomJ
.
y
-
atomI
.
y
;
delta
.
z
=
atomJ
.
z
-
atomI
.
z
;
// periodic box
delta
.
x
-=
floor
(
delta
.
x
*
cSim
.
invPeriodicBoxSizeX
+
0.5
f
)
*
cSim
.
periodicBoxSizeX
;
delta
.
y
-=
floor
(
delta
.
y
*
cSim
.
invPeriodicBoxSizeY
+
0.5
f
)
*
cSim
.
periodicBoxSizeY
;
delta
.
z
-=
floor
(
delta
.
z
*
cSim
.
invPeriodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
delta
.
w
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
delta
.
w
>
cSim
.
nonbondedCutoffSqr
||
delta
.
w
==
0.0
f
){
forceTorqueEnergy
[
0
].
x
=
forceTorqueEnergy
[
0
].
y
=
forceTorqueEnergy
[
0
].
z
=
forceTorqueEnergy
[
0
].
w
=
0.0
f
;
forceTorqueEnergy
[
1
].
x
=
forceTorqueEnergy
[
1
].
y
=
forceTorqueEnergy
[
1
].
z
=
0.0
f
;
forceTorqueEnergy
[
2
].
x
=
forceTorqueEnergy
[
2
].
y
=
forceTorqueEnergy
[
2
].
z
=
0.0
f
;
return
;
}
delta
.
w
=
sqrt
(
delta
.
w
);
float
r
=
delta
.
w
;
float
ralpha
=
cSim
.
alphaEwald
*
r
;
float
alsq2
=
2.0
f
*
cSim
.
alphaEwald
*
cSim
.
alphaEwald
;
float
alsq2n
=
0.0
f
;
if
(
cSim
.
alphaEwald
>
0.0
f
){
alsq2n
=
1.0
f
/
(
cAmoebaSim
.
sqrtPi
*
cSim
.
alphaEwald
);
}
float
exp2a
=
exp
(
-
(
ralpha
*
ralpha
));
float
rr1
=
1.0
f
/
r
;
float
bn0
=
erfc
(
ralpha
)
*
rr1
;
float
rr2
=
rr1
*
rr1
;
forceTorqueEnergy
[
0
].
w
=
atomI
.
q
*
atomJ
.
q
*
bn0
;
alsq2n
*=
alsq2
;
float4
bn
;
bn
.
x
=
(
bn0
+
alsq2n
*
exp2a
)
*
rr2
;
alsq2n
*=
alsq2
;
bn
.
y
=
(
3.0
f
*
bn
.
x
+
alsq2n
*
exp2a
)
*
rr2
;
alsq2n
*=
alsq2
;
bn
.
z
=
(
5.0
f
*
bn
.
y
+
alsq2n
*
exp2a
)
*
rr2
;
alsq2n
*=
alsq2
;
bn
.
w
=
(
7.0
f
*
bn
.
z
+
alsq2n
*
exp2a
)
*
rr2
;
alsq2n
*=
alsq2
;
float
bn5
=
(
9.0
f
*
bn
.
w
+
alsq2n
*
exp2a
)
*
rr2
;
//calculatePmeDirectElectrostaticPairIxn_F1_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnF1_kernel
(
atomI
,
atomJ
,
delta
,
bn
,
bn5
,
scalingFactors
,
forceTorqueEnergy
);
//calculatePmeDirectElectrostaticPairIxn_F2_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnF2_kernel
(
atomI
,
atomJ
,
delta
,
bn
,
scalingFactors
,
forceTorqueEnergy
);
//calculatePmeDirectElectrostaticPairIxn_T1_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnT1_kernel
(
atomI
,
atomJ
,
delta
,
bn
,
scalingFactors
,
(
forceTorqueEnergy
+
1
)
);
//calculatePmeDirectElectrostaticPairIxn_T2_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
calculatePmeDirectElectrostaticPairIxnT2_kernel
(
atomI
,
atomJ
,
delta
,
bn
,
scalingFactors
,
(
forceTorqueEnergy
+
1
)
);
//calculatePmeDirectElectrostaticPairIxn_T3_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxnT3_kernel( atomI, atomJ, delta, bn, scalingFactors, (forceTorqueEnergy+2) );
// T3 == T1 w/ particles I and J reversed
// T4 == T2 w/ particles I and J reversed
delta
.
x
*=
-
1.0
f
;
delta
.
y
*=
-
1.0
f
;
delta
.
z
*=
-
1.0
f
;
calculatePmeDirectElectrostaticPairIxnT1_kernel
(
atomJ
,
atomI
,
delta
,
bn
,
scalingFactors
,
(
forceTorqueEnergy
+
2
)
);
calculatePmeDirectElectrostaticPairIxnT2_kernel
(
atomJ
,
atomI
,
delta
,
bn
,
scalingFactors
,
(
forceTorqueEnergy
+
2
)
);
//calculatePmeDirectElectrostaticPairIxn_T4_kernel( atomI, atomJ, delta, scalingFactors, forceTorqueEnergy );
//calculatePmeDirectElectrostaticPairIxnT4_kernel( atomI, atomJ, delta, scalingFactors, (forceTorqueEnergy+2) );
return
;
}
__device__
void
calculatePmeDirectElectrostaticPairIxnOrig_kernel
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
,
float
*
scalingFactors
,
float4
forceTorqueEnergy
[
3
]
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
...
...
@@ -186,12 +1020,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi4
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
5
];
float
qi7
=
atomI
.
labFrameQuadrupole
[
6
];
float
qi8
=
atomI
.
labFrameQuadrupole
[
7
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
8
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
...
...
@@ -200,12 +1031,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk4
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
5
];
float
qk7
=
atomJ
.
labFrameQuadrupole
[
6
];
float
qk8
=
atomJ
.
labFrameQuadrupole
[
7
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
8
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
5
];
// calculate the real space error function terms
...
...
@@ -325,25 +1153,25 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
dkxr2
=
dk3
*
xr
-
dk1
*
zr
;
float
dkxr3
=
dk1
*
yr
-
dk2
*
xr
;
float
qir1
=
qi1
*
xr
+
qi
4
*
yr
+
qi
7
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi
8
*
zr
;
float
qir1
=
qi1
*
xr
+
qi
2
*
yr
+
qi
3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi
6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk
4
*
yr
+
qk
7
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk
8
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk
2
*
yr
+
qk
3
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk
6
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
qiqkr1
=
qi1
*
qkr1
+
qi
4
*
qkr2
+
qi
7
*
qkr3
;
float
qiqkr2
=
qi2
*
qkr1
+
qi5
*
qkr2
+
qi
8
*
qkr3
;
float
qiqkr1
=
qi1
*
qkr1
+
qi
2
*
qkr2
+
qi
3
*
qkr3
;
float
qiqkr2
=
qi2
*
qkr1
+
qi5
*
qkr2
+
qi
6
*
qkr3
;
float
qiqkr3
=
qi3
*
qkr1
+
qi6
*
qkr2
+
qi9
*
qkr3
;
float
qkqir1
=
qk1
*
qir1
+
qk
4
*
qir2
+
qk
7
*
qir3
;
float
qkqir2
=
qk2
*
qir1
+
qk5
*
qir2
+
qk
8
*
qir3
;
float
qkqir1
=
qk1
*
qir1
+
qk
2
*
qir2
+
qk
3
*
qir3
;
float
qkqir2
=
qk2
*
qir1
+
qk5
*
qir2
+
qk
6
*
qir3
;
float
qkqir3
=
qk3
*
qir1
+
qk6
*
qir2
+
qk9
*
qir3
;
float
qixqk1
=
qi2
*
qk3
+
qi5
*
qk6
+
qi
8
*
qk9
-
qi3
*
qk2
-
qi6
*
qk5
-
qi9
*
qk
8
;
float
qixqk2
=
qi3
*
qk1
+
qi6
*
qk
4
+
qi9
*
qk
7
-
qi1
*
qk3
-
qi
4
*
qk6
-
qi
7
*
qk9
;
float
qixqk3
=
qi1
*
qk2
+
qi
4
*
qk5
+
qi
7
*
qk
8
-
qi2
*
qk1
-
qi5
*
qk
4
-
qi
8
*
qk
7
;
float
qixqk1
=
qi2
*
qk3
+
qi5
*
qk6
+
qi
6
*
qk9
-
qi3
*
qk2
-
qi6
*
qk5
-
qi9
*
qk
6
;
float
qixqk2
=
qi3
*
qk1
+
qi6
*
qk
2
+
qi9
*
qk
3
-
qi1
*
qk3
-
qi
2
*
qk6
-
qi
3
*
qk9
;
float
qixqk3
=
qi1
*
qk2
+
qi
2
*
qk5
+
qi
3
*
qk
6
-
qi2
*
qk1
-
qi5
*
qk
2
-
qi
6
*
qk
3
;
float
rxqir1
=
yr
*
qir3
-
zr
*
qir2
;
float
rxqir2
=
zr
*
qir1
-
xr
*
qir3
;
...
...
@@ -365,28 +1193,28 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
qkrxqir2
=
qkr3
*
qir1
-
qkr1
*
qir3
;
float
qkrxqir3
=
qkr1
*
qir2
-
qkr2
*
qir1
;
float
qidk1
=
qi1
*
dk1
+
qi
4
*
dk2
+
qi
7
*
dk3
;
float
qidk2
=
qi2
*
dk1
+
qi5
*
dk2
+
qi
8
*
dk3
;
float
qidk1
=
qi1
*
dk1
+
qi
2
*
dk2
+
qi
3
*
dk3
;
float
qidk2
=
qi2
*
dk1
+
qi5
*
dk2
+
qi
6
*
dk3
;
float
qidk3
=
qi3
*
dk1
+
qi6
*
dk2
+
qi9
*
dk3
;
float
qkdi1
=
qk1
*
di1
+
qk
4
*
di2
+
qk
7
*
di3
;
float
qkdi2
=
qk2
*
di1
+
qk5
*
di2
+
qk
8
*
di3
;
float
qkdi1
=
qk1
*
di1
+
qk
2
*
di2
+
qk
3
*
di3
;
float
qkdi2
=
qk2
*
di1
+
qk5
*
di2
+
qk
6
*
di3
;
float
qkdi3
=
qk3
*
di1
+
qk6
*
di2
+
qk9
*
di3
;
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi
4
*
atomJ
.
inducedDipole
[
1
]
+
qi
7
*
atomJ
.
inducedDipole
[
2
];
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi
8
*
atomJ
.
inducedDipole
[
2
];
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi
2
*
atomJ
.
inducedDipole
[
1
]
+
qi
3
*
atomJ
.
inducedDipole
[
2
];
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi
6
*
atomJ
.
inducedDipole
[
2
];
float
qiuk3
=
qi3
*
atomJ
.
inducedDipole
[
0
]
+
qi6
*
atomJ
.
inducedDipole
[
1
]
+
qi9
*
atomJ
.
inducedDipole
[
2
];
float
qkui1
=
qk1
*
atomI
.
inducedDipole
[
0
]
+
qk
4
*
atomI
.
inducedDipole
[
1
]
+
qk
7
*
atomI
.
inducedDipole
[
2
];
float
qkui2
=
qk2
*
atomI
.
inducedDipole
[
0
]
+
qk5
*
atomI
.
inducedDipole
[
1
]
+
qk
8
*
atomI
.
inducedDipole
[
2
];
float
qkui1
=
qk1
*
atomI
.
inducedDipole
[
0
]
+
qk
2
*
atomI
.
inducedDipole
[
1
]
+
qk
3
*
atomI
.
inducedDipole
[
2
];
float
qkui2
=
qk2
*
atomI
.
inducedDipole
[
0
]
+
qk5
*
atomI
.
inducedDipole
[
1
]
+
qk
6
*
atomI
.
inducedDipole
[
2
];
float
qkui3
=
qk3
*
atomI
.
inducedDipole
[
0
]
+
qk6
*
atomI
.
inducedDipole
[
1
]
+
qk9
*
atomI
.
inducedDipole
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi
4
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
7
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
8
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi
2
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
3
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
6
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp3
=
qi3
*
atomJ
.
inducedDipoleP
[
0
]
+
qi6
*
atomJ
.
inducedDipoleP
[
1
]
+
qi9
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip1
=
qk1
*
atomI
.
inducedDipoleP
[
0
]
+
qk
4
*
atomI
.
inducedDipoleP
[
1
]
+
qk
7
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip2
=
qk2
*
atomI
.
inducedDipoleP
[
0
]
+
qk5
*
atomI
.
inducedDipoleP
[
1
]
+
qk
8
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip1
=
qk1
*
atomI
.
inducedDipoleP
[
0
]
+
qk
2
*
atomI
.
inducedDipoleP
[
1
]
+
qk
3
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip2
=
qk2
*
atomI
.
inducedDipoleP
[
0
]
+
qk5
*
atomI
.
inducedDipoleP
[
1
]
+
qk
6
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip3
=
qk3
*
atomI
.
inducedDipoleP
[
0
]
+
qk6
*
atomI
.
inducedDipoleP
[
1
]
+
qk9
*
atomI
.
inducedDipoleP
[
2
];
float
dixqkr1
=
di2
*
qkr3
-
di3
*
qkr2
;
...
...
@@ -448,8 +1276,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
sc8
=
qkr1
*
di1
+
qkr2
*
di2
+
qkr3
*
di3
;
float
sc9
=
qir1
*
qkr1
+
qir2
*
qkr2
+
qir3
*
qkr3
;
float
sc10
=
qi1
*
qk1
+
qi2
*
qk2
+
qi3
*
qk3
+
qi
4
*
qk
4
+
qi5
*
qk5
+
qi6
*
qk6
+
qi
7
*
qk
7
+
qi
8
*
qk
8
+
qi9
*
qk9
;
+
qi
2
*
qk
2
+
qi5
*
qk5
+
qi6
*
qk6
+
qi
3
*
qk
3
+
qi
6
*
qk
6
+
qi9
*
qk9
;
// calculate the scalar products for induced components
...
...
@@ -1031,12 +1859,9 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
sA
->
labFrameQuadrupole
[
0
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
];
sA
->
labFrameQuadrupole
[
1
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
1
];
sA
->
labFrameQuadrupole
[
2
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
2
];
sA
->
labFrameQuadrupole
[
3
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
3
];
sA
->
labFrameQuadrupole
[
4
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
4
];
sA
->
labFrameQuadrupole
[
5
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
5
];
sA
->
labFrameQuadrupole
[
6
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
6
];
sA
->
labFrameQuadrupole
[
7
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
7
];
sA
->
labFrameQuadrupole
[
8
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
8
];
sA
->
labFrameQuadrupole
[
3
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
4
];
sA
->
labFrameQuadrupole
[
4
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
5
];
sA
->
labFrameQuadrupole
[
5
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
8
];
// induced dipole
...
...
@@ -1120,28 +1945,35 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
static
const
int
maxL1
=
0
;
if
(
threadsPerBlock
==
0
){
unsigned
int
sharedMemoryPerBlock
=
gpu
->
sharedMemoryPerBlock
;
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
if
(
gpu
->
sm_version
>=
SM_20
)
{
maxThreads
=
384
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
if
(
maxL1
){
sharedMemoryPerBlock
=
16384
;
cudaFuncSetCacheConfig
(
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
,
cudaFuncCachePreferL1
);
}
}
else
if
(
gpu
->
sm_version
>=
SM_12
){
maxThreads
=
128
;
else
}
else
{
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)),
maxThreads
);
}
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
),
sharedMemoryPerBlock
),
maxThreads
);
}
kClearFields_3
(
amoebaGpu
,
1
);
#ifdef AMOEBA_DEBUG
//
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces:
numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
gpu->nonbond_threads_per_block
=%
u
\n
"
,
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
maxL1
=%
d
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
),
(
sizeof
(
PmeDirectElectrostaticParticle
))
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
nonbond_threads_per_block
);
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
maxL1
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
//
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
...
...
@@ -1156,12 +1988,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
else
{
/*
if (gpu->sm_version >= SM_20)
cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferL1 );
//cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferShared );
*/
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
#ifdef AMOEBA_DEBUG
...
...
@@ -1187,7 +2013,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
void
cudaComputeAmoebaPmeElectrostatic
(
amoebaGpuContext
amoebaGpu
)
{
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
View file @
8a331fb9
...
...
@@ -239,6 +239,9 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// No interactions in this block.
}
else
{
#ifdef CALCULATE_FULL_TILE
flags
=
0xFFFFFFFF
;
#endif
sA
[
threadIdx
.
x
].
force
[
0
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
1
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
2
]
=
0
.
0
f
;
...
...
@@ -311,7 +314,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA
[
jIdx
].
torque
[
0
]
+=
forceTorqueEnergy
[
2
].
x
;
psA
[
jIdx
].
torque
[
1
]
+=
forceTorqueEnergy
[
2
].
y
;
psA
[
jIdx
].
torque
[
2
]
+=
forceTorqueEnergy
[
2
].
z
;
#ifndef CALCULATE_FULL_TILE
}
else
{
sA
[
threadIdx
.
x
].
tempForce
[
0
]
=
forceTorqueEnergy
[
0
].
x
;
...
...
@@ -345,6 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA
[
jIdx
].
torque
[
1
]
+=
sA
[
threadIdx
.
x
].
tempTorque
[
1
]
+
sA
[
threadIdx
.
x
+
16
].
tempTorque
[
1
];
psA
[
jIdx
].
torque
[
2
]
+=
sA
[
threadIdx
.
x
].
tempTorque
[
2
]
+
sA
[
threadIdx
.
x
+
16
].
tempTorque
[
2
];
}
#endif
}
}
// end of atoms out-of-bounds
}
// end of flags&(1<<j block
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
View file @
8a331fb9
...
...
@@ -437,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
maxThreads
=
192
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
...
...
@@ -469,7 +469,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)
,
gpu
->
sharedMemoryPerBlock
),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
View file @
8a331fb9
...
...
@@ -37,10 +37,9 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
//
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
#ifdef INCLUDE_MI_FIELD_BUFFERS
__device__
void
sumTempBuffer
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
){
atomI
.
tempBuffer
[
0
]
+=
atomJ
.
tempBuffer
[
0
];
...
...
@@ -51,6 +50,93 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
atomI
.
tempBufferP
[
1
]
+=
atomJ
.
tempBufferP
[
1
];
atomI
.
tempBufferP
[
2
]
+=
atomJ
.
tempBufferP
[
2
];
}
#endif
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__
void
setupMutualInducedFieldPairIxn_kernel
(
const
MutualInducedParticle
&
atomI
,
const
MutualInducedParticle
&
atomJ
,
const
float
uscale
,
float4
*
delta
,
float
*
preFactor2
)
{
// compute thedelta->xeal space portion of the Ewald summation
delta
->
x
=
atomJ
.
x
-
atomI
.
x
;
delta
->
y
=
atomJ
.
y
-
atomI
.
y
;
delta
->
z
=
atomJ
.
z
-
atomI
.
z
;
// pdelta->xiodic boundary conditions
delta
->
x
-=
floor
(
delta
->
x
*
cSim
.
invPeriodicBoxSizeX
+
0.5
f
)
*
cSim
.
periodicBoxSizeX
;
delta
->
y
-=
floor
(
delta
->
y
*
cSim
.
invPeriodicBoxSizeY
+
0.5
f
)
*
cSim
.
periodicBoxSizeY
;
delta
->
z
-=
floor
(
delta
->
z
*
cSim
.
invPeriodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
(
delta
->
x
*
delta
->
x
)
+
(
delta
->
y
*
delta
->
y
)
+
(
delta
->
z
*
delta
->
z
);
if
(
r2
<=
cSim
.
nonbondedCutoffSqr
){
float
r
=
sqrtf
(
r2
);
// calculate the error function damping terms
float
ralpha
=
cSim
.
alphaEwald
*
r
;
float
bn0
=
erfc
(
ralpha
)
/
r
;
float
alsq2
=
2.0
f
*
cSim
.
alphaEwald
*
cSim
.
alphaEwald
;
float
alsq2n
=
1.0
f
/
(
cAmoebaSim
.
sqrtPi
*
cSim
.
alphaEwald
);
float
exp2a
=
exp
(
-
(
ralpha
*
ralpha
));
alsq2n
*=
alsq2
;
float
bn1
=
(
bn0
+
alsq2n
*
exp2a
)
/
r2
;
alsq2n
*=
alsq2
;
float
bn2
=
(
3.0
f
*
bn1
+
alsq2n
*
exp2a
)
/
r2
;
// compute the error function scaled and unscaled terms
float
scale3
=
1.0
f
;
float
scale5
=
1.0
f
;
float
damp
=
atomI
.
damp
*
atomJ
.
damp
;
if
(
damp
!=
0.0
f
){
float
ratio
=
(
r
/
damp
);
ratio
=
ratio
*
ratio
*
ratio
;
float
pgamma
=
atomI
.
thole
<
atomJ
.
thole
?
atomI
.
thole
:
atomJ
.
thole
;
damp
=
-
pgamma
*
ratio
;
if
(
damp
>
-
50.0
f
)
{
float
expdamp
=
exp
(
damp
);
scale3
=
1.0
f
-
expdamp
;
scale5
=
1.0
f
-
expdamp
*
(
1.0
f
-
damp
);
}
}
float
dsc3
=
uscale
*
scale3
;
float
dsc5
=
uscale
*
scale5
;
float
r3
=
(
r
*
r2
);
float
r5
=
(
r3
*
r2
);
float
rr3
=
(
1.0
f
-
dsc3
)
/
r3
;
float
rr5
=
3.0
f
*
(
1.0
f
-
dsc5
)
/
r5
;
delta
->
w
=
rr3
-
bn1
;
*
preFactor2
=
bn2
-
rr5
;
}
else
{
delta
->
w
=
*
preFactor2
=
0.0
f
;
}
}
__device__
void
calculateMutualInducedFieldPairIxn_kernel
(
const
float
inducedDipole
[
3
],
const
float4
delta
,
const
float
preFactor2
,
float
fieldSum
[
3
]
)
{
float
preFactor3
=
preFactor2
*
(
inducedDipole
[
0
]
*
delta
.
x
+
inducedDipole
[
1
]
*
delta
.
y
+
inducedDipole
[
2
]
*
delta
.
z
);
fieldSum
[
0
]
+=
preFactor3
*
delta
.
x
+
delta
.
w
*
inducedDipole
[
0
];
fieldSum
[
1
]
+=
preFactor3
*
delta
.
y
+
delta
.
w
*
inducedDipole
[
1
];
fieldSum
[
2
]
+=
preFactor3
*
delta
.
z
+
delta
.
w
*
inducedDipole
[
2
];
}
__device__
void
calculateMutualInducedFieldPairIxnNoAdd_kernel
(
const
float
inducedDipole
[
3
],
const
float4
delta
,
const
float
preFactor2
,
float
fieldSum
[
3
]
)
{
float
preFactor3
=
preFactor2
*
(
inducedDipole
[
0
]
*
delta
.
x
+
inducedDipole
[
1
]
*
delta
.
y
+
inducedDipole
[
2
]
*
delta
.
z
);
fieldSum
[
0
]
=
preFactor3
*
delta
.
x
+
delta
.
w
*
inducedDipole
[
0
];
fieldSum
[
1
]
=
preFactor3
*
delta
.
y
+
delta
.
w
*
inducedDipole
[
1
];
fieldSum
[
2
]
=
preFactor3
*
delta
.
z
+
delta
.
w
*
inducedDipole
[
2
];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
...
...
@@ -385,7 +471,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
...
...
@@ -573,17 +659,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu
->
psWorkVector
[
0
]
->
_pDevData
,
amoebaGpu
->
psWorkVector
[
1
]
->
_pDevData
);
LAUNCHERROR
(
"kSorUpdatePmeMutualInducedField"
);
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
iteration
);
VectorOfDoubleVectors
outputVector
;
//
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
//
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeDirectMI"
,
fileId
,
outputVector
);
}
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
iteration
);
VectorOfDoubleVectors
outputVector
;
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeDirectMI"
,
fileId
,
outputVector
);
}
// get total epsilon -- performing sums on gpu
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
View file @
8a331fb9
...
...
@@ -100,99 +100,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float4
ijField
[
3
];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
uscale
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
j
)
>=
cSim
.
atoms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
fieldSum
[
0
]
+=
mask
?
ijField
[
0
].
x
:
0
.
0
f
;
fieldSum
[
1
]
+=
mask
?
ijField
[
1
].
x
:
0
.
0
f
;
fieldSum
[
2
]
+=
mask
?
ijField
[
2
].
x
:
0
.
0
f
;
fieldPolarSum
[
0
]
+=
mask
?
ijField
[
0
].
z
:
0
.
0
f
;
fieldPolarSum
[
1
]
+=
mask
?
ijField
[
1
].
z
:
0
.
0
f
;
fieldPolarSum
[
2
]
+=
mask
?
ijField
[
2
].
z
:
0
.
0
f
;
#ifdef AMOEBA_DEBUG
/*
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
}
*/
#endif
float4
delta
;
float
prefactor2
;
if
(
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
j
)
>=
cSim
.
atoms
)
)
){
delta
.
w
=
prefactor2
=
0
.
0
f
;
}
else
{
setupMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
uscale
,
&
delta
,
&
prefactor2
);
}
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
j
].
inducedDipole
,
delta
,
prefactor2
,
fieldSum
);
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
j
].
inducedDipolePolar
,
delta
,
prefactor2
,
fieldPolarSum
);
}
...
...
@@ -226,6 +144,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// No interactions in this block.
}
else
{
#ifndef INCLUDE_MI_FIELD_BUFFERS
flags
=
0xFFFFFFFF
;
#endif
// zero shared fields
zeroMutualInducedParticleSharedField
(
&
(
sA
[
threadIdx
.
x
])
);
...
...
@@ -235,53 +157,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
float4
ijField
[
3
];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
jIdx
],
uscale
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
jIdx
)
>=
cSim
.
atoms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
fieldSum
[
0
]
+=
mask
?
ijField
[
0
].
x
:
0
.
0
f
;
fieldSum
[
1
]
+=
mask
?
ijField
[
1
].
x
:
0
.
0
f
;
fieldSum
[
2
]
+=
mask
?
ijField
[
2
].
x
:
0
.
0
f
;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum
[
0
]
+=
mask
?
ijField
[
0
].
z
:
0
.
0
f
;
fieldPolarSum
[
1
]
+=
mask
?
ijField
[
1
].
z
:
0
.
0
f
;
fieldPolarSum
[
2
]
+=
mask
?
ijField
[
2
].
z
:
0
.
0
f
;
// add to field at atomJ the field due atomI's dipole
float4
delta
;
float
prefactor2
;
if
(
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
jIdx
)
>=
cSim
.
atoms
)
){
delta
.
w
=
prefactor2
=
0
.
0
f
;
}
else
{
setupMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
jIdx
],
uscale
,
&
delta
,
&
prefactor2
);
}
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
jIdx
].
inducedDipole
,
delta
,
prefactor2
,
fieldSum
);
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
jIdx
].
inducedDipolePolar
,
delta
,
prefactor2
,
fieldPolarSum
);
#ifndef INCLUDE_MI_FIELD_BUFFERS
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipole
,
delta
,
prefactor2
,
psA
[
jIdx
].
field
);
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipolePolar
,
delta
,
prefactor2
,
psA
[
jIdx
].
fieldPolar
);
#else
if
(
flags
==
0xFFFFFFFF
){
psA
[
jIdx
].
field
[
0
]
+=
mask
?
ijField
[
0
].
y
:
0
.
0
f
;
psA
[
jIdx
].
field
[
1
]
+=
mask
?
ijField
[
1
].
y
:
0
.
0
f
;
psA
[
jIdx
].
field
[
2
]
+=
mask
?
ijField
[
2
].
y
:
0
.
0
f
;
// add to polar field at atomJ the field due atomI's dipole
psA
[
jIdx
].
fieldPolar
[
0
]
+=
mask
?
ijField
[
0
].
w
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
1
]
+=
mask
?
ijField
[
1
].
w
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
2
]
+=
mask
?
ijField
[
2
].
w
:
0
.
0
f
;
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipole
,
delta
,
prefactor2
,
psA
[
jIdx
].
field
);
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipolePolar
,
delta
,
prefactor2
,
psA
[
jIdx
].
fieldPolar
);
}
else
{
sA
[
threadIdx
.
x
].
tempBuffer
[
0
]
=
mask
?
ijField
[
0
].
y
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBuffer
[
1
]
=
mask
?
ijField
[
1
].
y
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBuffer
[
2
]
=
mask
?
ijField
[
2
].
y
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBufferP
[
0
]
=
mask
?
ijField
[
0
].
w
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBufferP
[
1
]
=
mask
?
ijField
[
1
].
w
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBufferP
[
2
]
=
mask
?
ijField
[
2
].
w
:
0
.
0
;
calculateMutualInducedFieldPairIxnNoAdd_kernel
(
localParticle
.
inducedDipole
,
delta
,
prefactor2
,
sA
[
threadIdx
.
x
].
tempBuffer
);
calculateMutualInducedFieldPairIxnNoAdd_kernel
(
localParticle
.
inducedDipolePolar
,
delta
,
prefactor2
,
sA
[
threadIdx
.
x
].
tempBufferP
);
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
sA
[
threadIdx
.
x
],
sA
[
threadIdx
.
x
+
1
]
);
...
...
@@ -308,61 +202,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
}
}
/*
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
}
#endif
*/
}
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
View file @
8a331fb9
...
...
@@ -531,7 +531,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
maxThreads
=
192
;
else
maxThreads
=
128
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
Vdw14_7Particle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
Vdw14_7Particle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
if
(
0
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
View file @
8a331fb9
...
...
@@ -382,7 +382,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
maxThreads
=
192
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
WcaDispersionParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
WcaDispersionParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
#ifdef AMOEBA_DEBUG
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment