Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8a331fb9
Commit
8a331fb9
authored
Apr 20, 2011
by
Mark Friedrichs
Browse files
Direct space optimizations
parent
af4d503a
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
1046 additions
and
290 deletions
+1046
-290
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+6
-5
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
...ins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
...cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
...forms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
...latforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
...c/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
...uda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
...a/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+0
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
...src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+890
-65
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
.../src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+6
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
...ms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+2
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
.../src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+101
-15
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
...a/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+31
-190
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
...platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
...rms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+1
-1
No files found.
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
View file @
8a331fb9
...
@@ -3449,19 +3449,20 @@ tgx = 0;
...
@@ -3449,19 +3449,20 @@ tgx = 0;
Get threads/block
Get threads/block
@param amoebaGpu amoebaGpuContext
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
)
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
,
unsigned
int
sharedMemoryPerBlock
)
{
{
unsigned
int
grid
=
amoebaGpu
->
gpuContext
->
grid
;
unsigned
int
grid
=
amoebaGpu
->
gpuContext
->
grid
;
unsigned
int
threadsPerBlock
=
(
amoebaGpu
->
gpuContext
->
sharedMemoryPerBlock
+
grid
-
1
)
/
(
grid
*
sharedMemoryPerThread
);
unsigned
int
threadsPerBlock
=
(
sharedMemoryPerBlock
+
grid
-
1
)
/
(
grid
*
sharedMemoryPerThread
);
threadsPerBlock
=
threadsPerBlock
<
1
?
1
:
threadsPerBlock
;
threadsPerBlock
=
threadsPerBlock
<
1
?
1
:
threadsPerBlock
;
threadsPerBlock
*=
grid
;
threadsPerBlock
*=
grid
;
return
threadsPerBlock
;
return
threadsPerBlock
;
}
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
View file @
8a331fb9
...
@@ -160,7 +160,7 @@ extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAS
...
@@ -160,7 +160,7 @@ extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAS
extern
void
kClearFloat4
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
entries
,
CUDAStream
<
float4
>*
fieldToClear
);
extern
void
kClearFloat4
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
entries
,
CUDAStream
<
float4
>*
fieldToClear
);
extern
void
kClearFields_1
(
amoebaGpuContext
amoebaGpu
);
extern
void
kClearFields_1
(
amoebaGpuContext
amoebaGpu
);
extern
void
kClearFields_3
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
numberToClear
);
extern
void
kClearFields_3
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
numberToClear
);
extern
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
);
extern
unsigned
int
getThreadsPerBlock
(
amoebaGpuContext
amoebaGpu
,
unsigned
int
sharedMemoryPerThread
,
unsigned
int
sharedMemoryPerBlock
);
//extern int isNanOrInfinity( double number );
//extern int isNanOrInfinity( double number );
extern
void
trackMutualInducedIterations
(
amoebaGpuContext
amoebaGpu
,
int
iteration
);
extern
void
trackMutualInducedIterations
(
amoebaGpuContext
amoebaGpu
,
int
iteration
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
View file @
8a331fb9
...
@@ -759,7 +759,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
...
@@ -759,7 +759,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
ElectrostaticParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
ElectrostaticParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
kClearFields_3
(
amoebaGpu
,
1
);
kClearFields_3
(
amoebaGpu
,
1
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
View file @
8a331fb9
...
@@ -362,7 +362,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
...
@@ -362,7 +362,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
kClearFields_3
(
amoebaGpu
,
3
);
kClearFields_3
(
amoebaGpu
,
3
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
View file @
8a331fb9
...
@@ -108,7 +108,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
...
@@ -108,7 +108,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
View file @
8a331fb9
...
@@ -1813,7 +1813,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
...
@@ -1813,7 +1813,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
if
(
amoebaGpu
->
log
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
View file @
8a331fb9
...
@@ -978,7 +978,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
...
@@ -978,7 +978,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
maxThreads
=
96
;
maxThreads
=
96
;
else
else
maxThreads
=
32
;
maxThreads
=
32
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodEDiffParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
KirkwoodEDiffParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
View file @
8a331fb9
...
@@ -490,7 +490,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
...
@@ -490,7 +490,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
View file @
8a331fb9
...
@@ -276,7 +276,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
...
@@ -276,7 +276,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
View file @
8a331fb9
...
@@ -4,7 +4,6 @@ struct MutualInducedParticle {
...
@@ -4,7 +4,6 @@ struct MutualInducedParticle {
float
x
;
float
x
;
float
y
;
float
y
;
float
z
;
float
z
;
float
q
;
float
inducedDipole
[
3
];
float
inducedDipole
[
3
];
float
inducedDipolePolar
[
3
];
float
inducedDipolePolar
[
3
];
...
@@ -41,7 +40,6 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
...
@@ -41,7 +40,6 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
sA
->
x
=
posq
.
x
;
sA
->
x
=
posq
.
x
;
sA
->
y
=
posq
.
y
;
sA
->
y
=
posq
.
y
;
sA
->
z
=
posq
.
z
;
sA
->
z
=
posq
.
z
;
sA
->
q
=
posq
.
w
;
// dipole
// dipole
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
View file @
8a331fb9
...
@@ -38,6 +38,8 @@ static int const UScaleIndex = 2;
...
@@ -38,6 +38,8 @@ static int const UScaleIndex = 2;
static
int
const
MScaleIndex
=
3
;
static
int
const
MScaleIndex
=
3
;
static
int
const
LastScalingIndex
=
4
;
static
int
const
LastScalingIndex
=
4
;
#define CALCULATE_FULL_TILE
struct
PmeDirectElectrostaticParticle
{
struct
PmeDirectElectrostaticParticle
{
// coordinates charge
// coordinates charge
...
@@ -53,7 +55,7 @@ struct PmeDirectElectrostaticParticle {
...
@@ -53,7 +55,7 @@ struct PmeDirectElectrostaticParticle {
// lab frame quadrupole
// lab frame quadrupole
float
labFrameQuadrupole
[
9
];
float
labFrameQuadrupole
[
6
];
// induced dipole
// induced dipole
...
@@ -69,14 +71,16 @@ struct PmeDirectElectrostaticParticle {
...
@@ -69,14 +71,16 @@ struct PmeDirectElectrostaticParticle {
float
damp
;
float
damp
;
float
force
[
3
];
float
force
[
3
];
float
torque
[
3
];
float
torque
[
3
];
float
padding
;
//
float padding;
#ifndef CALCULATE_FULL_TILE
float
tempForce
[
3
];
float
tempForce
[
3
];
float
tempTorque
[
3
];
float
tempTorque
[
3
];
#endif
};
};
#ifndef CALCULATE_FULL_TILE
__device__
void
sumTempBuffer
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
){
__device__
void
sumTempBuffer
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
){
atomI
.
tempForce
[
0
]
+=
atomJ
.
tempForce
[
0
];
atomI
.
tempForce
[
0
]
+=
atomJ
.
tempForce
[
0
];
...
@@ -87,6 +91,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
...
@@ -87,6 +91,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
atomI
.
tempTorque
[
1
]
+=
atomJ
.
tempTorque
[
1
];
atomI
.
tempTorque
[
1
]
+=
atomJ
.
tempTorque
[
1
];
atomI
.
tempTorque
[
2
]
+=
atomJ
.
tempTorque
[
2
];
atomI
.
tempTorque
[
2
]
+=
atomJ
.
tempTorque
[
2
];
}
}
#endif
/*
/*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
...
@@ -124,11 +129,11 @@ __device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDir
...
@@ -124,11 +129,11 @@ __device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDir
atomI
.
labFrameDipole
[
2
]
*
atomI
.
labFrameDipole
[
2
];
atomI
.
labFrameDipole
[
2
]
*
atomI
.
labFrameDipole
[
2
];
float
qii
=
atomI
.
labFrameQuadrupole
[
0
]
*
atomI
.
labFrameQuadrupole
[
0
]
+
float
qii
=
atomI
.
labFrameQuadrupole
[
0
]
*
atomI
.
labFrameQuadrupole
[
0
]
+
atomI
.
labFrameQuadrupole
[
4
]
*
atomI
.
labFrameQuadrupole
[
4
]
+
atomI
.
labFrameQuadrupole
[
3
]
*
atomI
.
labFrameQuadrupole
[
3
]
+
atomI
.
labFrameQuadrupole
[
8
]
*
atomI
.
labFrameQuadrupole
[
8
]
+
2.0
f
*
(
atomI
.
labFrameQuadrupole
[
5
]
*
atomI
.
labFrameQuadrupole
[
5
]
+
2.0
f
*
(
atomI
.
labFrameQuadrupole
[
1
]
*
atomI
.
labFrameQuadrupole
[
1
]
+
atomI
.
labFrameQuadrupole
[
1
]
*
atomI
.
labFrameQuadrupole
[
1
]
+
atomI
.
labFrameQuadrupole
[
2
]
*
atomI
.
labFrameQuadrupole
[
2
]
+
atomI
.
labFrameQuadrupole
[
2
]
*
atomI
.
labFrameQuadrupole
[
2
]
+
atomI
.
labFrameQuadrupole
[
5
]
*
atomI
.
labFrameQuadrupole
[
5
]);
atomI
.
labFrameQuadrupole
[
4
]
*
atomI
.
labFrameQuadrupole
[
4
]);
float
uii
=
atomI
.
labFrameDipole
[
0
]
*
atomI
.
inducedDipole
[
0
]
+
atomI
.
labFrameDipole
[
1
]
*
atomI
.
inducedDipole
[
1
]
+
atomI
.
labFrameDipole
[
2
]
*
atomI
.
inducedDipole
[
2
];
float
uii
=
atomI
.
labFrameDipole
[
0
]
*
atomI
.
inducedDipole
[
0
]
+
atomI
.
labFrameDipole
[
1
]
*
atomI
.
inducedDipole
[
1
]
+
atomI
.
labFrameDipole
[
2
]
*
atomI
.
inducedDipole
[
2
];
...
@@ -152,7 +157,836 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
...
@@ -152,7 +157,836 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
atomI
.
torque
[
2
]
+=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
atomI
.
torque
[
2
]
+=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
}
}
__device__
void
calculatePmeDirectElectrostaticPairIxn_kernel
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
,
/*
#undef SUB_METHOD_NAME
#undef F1
#define SUB_METHOD_NAME(a, b) a##F1##b
#define F1
//#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F1
//#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##F2##b
#define F2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef F2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T1
#define SUB_METHOD_NAME(a, b) a##T1##b
#define T1
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T1
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T2##b
#define T2
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T3
#define SUB_METHOD_NAME(a, b) a##T3##b
#define T3
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T3
#undef SUB_METHOD_NAME
#define SUB_METHOD_NAME(a, b) a##T4##b
#define T4
#include "kCalculateAmoebaCudaPmeDirectElectrostatic_b.h"
#undef T4
#undef SUB_METHOD_NAME
*/
/**
 * "F1" stage of the PME direct-space electrostatic pair interaction.
 *
 * Uses only the charge, lab-frame dipole and lab-frame quadrupole of the two
 * atoms (no induced dipoles are read here).  The quadrupole is stored as six
 * floats and is used as a symmetric 3x3 tensor in the order
 * xx, xy, xz, yy, yz, zz.
 *
 * @param atomI              first particle of the pair
 * @param atomJ              second particle of the pair
 * @param delta              x/y/z hold the separation vector, w its length
 *                           (NOTE(review): sign convention, I-to-J or J-to-I,
 *                           is set by the caller and not visible here)
 * @param bn                 four screening coefficients (x,y,z,w = orders 1-4);
 *                           presumably the Ewald B_n terms -- confirm in caller
 * @param bn5                fifth screening coefficient
 * @param scalingFactors     only the MScaleIndex entry is read
 * @param forceTorqueEnergy  x, y, z are OVERWRITTEN with the force components;
 *                           w is ACCUMULATED with the pair energy
 */
static __device__ void calculatePmeDirectElectrostaticPairIxnF1_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
                                                                        const float4 delta, const float4 bn, const float bn5,
                                                                        const float* scalingFactors, float4* forceTorqueEnergy ){

    // separation vector and distance

    const float dx            = delta.x;
    const float dy            = delta.y;
    const float dz            = delta.z;
    const float dist          = delta.w;

    // multipole of atom I

    const float chargeI       = atomI.q;

    const float dipIx         = atomI.labFrameDipole[0];
    const float dipIy         = atomI.labFrameDipole[1];
    const float dipIz         = atomI.labFrameDipole[2];

    const float quadIxx       = atomI.labFrameQuadrupole[0];
    const float quadIxy       = atomI.labFrameQuadrupole[1];
    const float quadIxz       = atomI.labFrameQuadrupole[2];
    const float quadIyy       = atomI.labFrameQuadrupole[3];
    const float quadIyz       = atomI.labFrameQuadrupole[4];
    const float quadIzz       = atomI.labFrameQuadrupole[5];

    // multipole of atom J

    const float chargeJ       = atomJ.q;

    const float dipJx         = atomJ.labFrameDipole[0];
    const float dipJy         = atomJ.labFrameDipole[1];
    const float dipJz         = atomJ.labFrameDipole[2];

    const float quadJxx       = atomJ.labFrameQuadrupole[0];
    const float quadJxy       = atomJ.labFrameQuadrupole[1];
    const float quadJxz       = atomJ.labFrameQuadrupole[2];
    const float quadJyy       = atomJ.labFrameQuadrupole[3];
    const float quadJyz       = atomJ.labFrameQuadrupole[4];
    const float quadJzz       = atomJ.labFrameQuadrupole[5];

    // screening coefficients

    const float b1            = bn.x;
    const float b2            = bn.y;
    const float b3            = bn.z;
    const float b4            = bn.w;

    // odd inverse powers of the distance; each one carries the running
    // product 1*3*5*... as a prefactor

    const float inv1          = 1.0f / dist;
    const float invSq         = inv1*inv1;
    const float inv3          = inv1*invSq;
    const float inv5          = 3.0f*inv3*invSq;
    const float inv7          = 5.0f*inv5*invSq;
    const float inv9          = 7.0f*inv7*invSq;
    const float inv11         = 9.0f*inv9*invSq;

    // (1 - mScale): weight of the bare 1/r^n part that is subtracted from
    // every screened coefficient below

    const float mOff          = 1.0f - scalingFactors[MScaleIndex];

    const float screen3       = b1 - mOff*inv3;
    const float screen5       = b2 - mOff*inv5;
    const float screen7       = b3 - mOff*inv7;
    const float screen9       = b4 - mOff*inv9;

    // quadrupole tensors applied to the separation vector

    const float qIrx          = quadIxx*dx + quadIxy*dy + quadIxz*dz;
    const float qIry          = quadIxy*dx + quadIyy*dy + quadIyz*dz;
    const float qIrz          = quadIxz*dx + quadIyz*dy + quadIzz*dz;

    const float qJrx          = quadJxx*dx + quadJxy*dy + quadJxz*dz;
    const float qJry          = quadJxy*dx + quadJyy*dy + quadJyz*dz;
    const float qJrz          = quadJxz*dx + quadJyz*dy + quadJzz*dz;

    // quadrupole tensors applied to the partner's dipole

    const float qIdJx         = quadIxx*dipJx + quadIxy*dipJy + quadIxz*dipJz;
    const float qIdJy         = quadIxy*dipJx + quadIyy*dipJy + quadIyz*dipJz;
    const float qIdJz         = quadIxz*dipJx + quadIyz*dipJy + quadIzz*dipJz;

    const float qJdIx         = quadJxx*dipIx + quadJxy*dipIy + quadJxz*dipIz;
    const float qJdIy         = quadJxy*dipIx + quadJyy*dipIy + quadJyz*dipIz;
    const float qJdIz         = quadJxz*dipIx + quadJyz*dipIy + quadJzz*dipIz;

    // quadrupole tensors applied to the partner's (Q.r) vector

    const float qIqJrx        = quadIxx*qJrx + quadIxy*qJry + quadIxz*qJrz;
    const float qIqJry        = quadIxy*qJrx + quadIyy*qJry + quadIyz*qJrz;
    const float qIqJrz        = quadIxz*qJrx + quadIyz*qJry + quadIzz*qJrz;

    const float qJqIrx        = quadJxx*qIrx + quadJxy*qIry + quadJxz*qIrz;
    const float qJqIry        = quadJxy*qIrx + quadJyy*qIry + quadJyz*qIrz;
    const float qJqIrz        = quadJxz*qIrx + quadJyz*qIry + quadJzz*qIrz;

    // scalar products of the permanent components

    const float dipDip        = dipIx*dipJx + dipIy*dipJy + dipIz*dipJz;
    const float dipIr         = dipIx*dx + dipIy*dy + dipIz*dz;
    const float dipJr         = dipJx*dx + dipJy*dy + dipJz*dz;
    const float rQIr          = qIrx*dx + qIry*dy + qIrz*dz;
    const float rQJr          = qJrx*dx + qJry*dy + qJrz*dz;
    const float qIrDipJ       = qIrx*dipJx + qIry*dipJy + qIrz*dipJz;
    const float qJrDipI       = qJrx*dipIx + qJry*dipIy + qJrz*dipIz;
    const float qIrQJr        = qIrx*qJrx + qIry*qJry + qIrz*qJrz;

    // full contraction of the two symmetric tensors, written out over all
    // nine elements (off-diagonal products therefore appear twice)

    const float quadQuad      = quadIxx*quadJxx + quadIxy*quadJxy + quadIxz*quadJxz +
                                quadIxy*quadJxy + quadIyy*quadJyy + quadIyz*quadJyz +
                                quadIxz*quadJxz + quadIyz*quadJyz + quadIzz*quadJzz;

    // gl functions for the permanent components

    const float gl0           = chargeI*chargeJ;
    const float gl1           = chargeJ*dipIr - chargeI*dipJr;
    const float gl2           = chargeI*rQJr + chargeJ*rQIr - dipIr*dipJr;
    const float gl3           = dipIr*rQJr - dipJr*rQIr;
    const float gl4           = rQIr*rQJr;
    const float gl5           = -4.0f*qIrQJr;
    const float gl6           = dipDip;
    const float gl7           = 2.0f*(qIrDipJ - qJrDipI);
    const float gl8           = 2.0f*quadQuad;

    // the gl terms always appear in these three groupings

    const float glOrder3      = gl1 + gl6;
    const float glOrder5      = gl2 + gl7 + gl8;
    const float glOrder7      = gl3 + gl5;

    // coefficients multiplying each vector in the force

    const float gRadialFull   = b1*gl0 + b2*glOrder3 + b3*glOrder5 + b4*glOrder7 + bn5*gl4;
    const float gRadialBare   = mOff*(inv3*gl0 + inv5*glOrder3 + inv7*glOrder5 + inv9*glOrder7 + inv11*gl4);
    const float gRadial       = gRadialFull - gRadialBare;

    const float gDipI         = -chargeJ*b1 + dipJr*b2 - rQJr*b3 - mOff*(-chargeJ*inv3 + dipJr*inv5 - rQJr*inv7);
    const float gDipJ         =  chargeI*b1 + dipIr*b2 + rQIr*b3 - mOff*( chargeI*inv3 + dipIr*inv5 + rQIr*inv7);

    const float gQuadDip      = 2.0f*screen5;
    const float gQuadIr       = 2.0f*(-chargeJ*b2 + dipJr*b3 - rQJr*b4 - mOff*(-chargeJ*inv5 + dipJr*inv7 - rQJr*inv9));
    const float gQuadJr       = 2.0f*(-chargeI*b2 - dipIr*b3 - rQIr*b4 - mOff*(-chargeI*inv5 - dipIr*inv7 - rQIr*inv9));
    const float gQuadQuad     = 4.0f*screen7;

    // assemble each force component; the terms are summed left to right in a
    // fixed order so the floating-point result does not depend on the layout

    const float forceX        = gQuadDip*(qJdIx - qIdJx) + gQuadQuad*(qIqJrx + qJqIrx) + gRadial*dx +
                                gDipI*dipIx + gDipJ*dipJx + gQuadIr*qIrx + gQuadJr*qJrx;

    const float forceY        = gQuadDip*(qJdIy - qIdJy) + gQuadQuad*(qIqJry + qJqIry) + gRadial*dy +
                                gDipI*dipIy + gDipJ*dipJy + gQuadIr*qIry + gQuadJr*qJry;

    const float forceZ        = gQuadDip*(qJdIz - qIdJz) + gQuadQuad*(qIqJrz + qJqIrz) + gRadial*dz +
                                gDipI*dipIz + gDipJ*dipJz + gQuadIr*qIrz + gQuadJr*qJrz;

    // energy is added to whatever the caller already stored in w ...

    forceTorqueEnergy->w     += -mOff*inv1*gl0 + screen3*glOrder3 + screen5*glOrder5 + screen7*glOrder7 + screen9*gl4;

    // ... whereas the force components replace the previous contents

    forceTorqueEnergy->x      = forceX;
    forceTorqueEnergy->y      = forceY;
    forceTorqueEnergy->z      = forceZ;
}
static
__device__
void
calculatePmeDirectElectrostaticPairIxnF2_kernel
(
const
PmeDirectElectrostaticParticle
&
atomI
,
const
PmeDirectElectrostaticParticle
&
atomJ
,
const
float4
delta
,
const
float4
bn
,
const
float
*
scalingFactors
,
float4
*
forceTorqueEnergy
){
float
xr
=
delta
.
x
;
float
yr
=
delta
.
y
;
float
zr
=
delta
.
z
;
float
r
=
delta
.
w
;
// set the permanent multipole and induced dipole values;
float
ci
=
atomI
.
q
;
float
di1
=
atomI
.
labFrameDipole
[
0
];
float
di2
=
atomI
.
labFrameDipole
[
1
];
float
di3
=
atomI
.
labFrameDipole
[
2
];
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
ck
=
atomJ
.
q
;
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
float
dk3
=
atomJ
.
labFrameDipole
[
2
];
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
5
];
float
bn1
=
bn
.
x
;
float
bn2
=
bn
.
y
;
float
bn3
=
bn
.
z
;
float
bn4
=
bn
.
w
;
// apply Thole polarization damping to scale factors
float
rr1
=
1.0
f
/
r
;
float
rr2
=
rr1
*
rr1
;
float
rr3
=
rr1
*
rr2
;
float
rr5
=
3.0
f
*
rr3
*
rr2
;
float
rr7
=
5.0
f
*
rr5
*
rr2
;
float
rr9
=
7.0
f
*
rr7
*
rr2
;
float
scale3
=
1.0
f
;
float
scale5
=
1.0
f
;
float
scale7
=
1.0
f
;
float
damp
=
atomI
.
damp
*
atomJ
.
damp
;
if
(
damp
!=
0.0
f
){
float
pgamma
=
atomI
.
thole
<
atomJ
.
thole
?
atomI
.
thole
:
atomJ
.
thole
;
float
ratio
=
r
/
damp
;
damp
=
-
pgamma
*
ratio
*
ratio
*
ratio
;
if
(
damp
>
-
50.0
f
){
float
expdamp
=
exp
(
damp
);
scale3
=
1.0
f
-
expdamp
;
scale5
=
1.0
f
-
(
1.0
f
-
damp
)
*
expdamp
;
scale7
=
1.0
f
-
(
1.0
f
-
damp
+
0.6
f
*
damp
*
damp
)
*
expdamp
;
}
else
{
damp
=
0.0
f
;
}
}
float
psc5
=
1.0
f
-
scale5
*
scalingFactors
[
PScaleIndex
];
float
dsc5
=
1.0
f
-
scale5
*
scalingFactors
[
DScaleIndex
];
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi2
*
atomJ
.
inducedDipole
[
1
]
+
qi3
*
atomJ
.
inducedDipole
[
2
];
float
qkui1
=
qk1
*
atomI
.
inducedDipole
[
0
]
+
qk2
*
atomI
.
inducedDipole
[
1
]
+
qk3
*
atomI
.
inducedDipole
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi2
*
atomJ
.
inducedDipoleP
[
1
]
+
qi3
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip1
=
qk1
*
atomI
.
inducedDipoleP
[
0
]
+
qk2
*
atomI
.
inducedDipoleP
[
1
]
+
qk3
*
atomI
.
inducedDipoleP
[
2
];
forceTorqueEnergy
->
x
+=
bn2
*
(
qkui1
+
qkuip1
-
qiuk1
-
qiukp1
);
forceTorqueEnergy
->
x
-=
rr5
*
((
qkui1
-
qiuk1
)
*
psc5
+
(
qkuip1
-
qiukp1
)
*
dsc5
);
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi6
*
atomJ
.
inducedDipole
[
2
];
float
qkui2
=
qk2
*
atomI
.
inducedDipole
[
0
]
+
qk5
*
atomI
.
inducedDipole
[
1
]
+
qk6
*
atomI
.
inducedDipole
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi6
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip2
=
qk2
*
atomI
.
inducedDipoleP
[
0
]
+
qk5
*
atomI
.
inducedDipoleP
[
1
]
+
qk6
*
atomI
.
inducedDipoleP
[
2
];
forceTorqueEnergy
->
y
+=
bn2
*
(
qkui2
+
qkuip2
-
qiuk2
-
qiukp2
);
forceTorqueEnergy
->
y
-=
rr5
*
((
qkui2
-
qiuk2
)
*
psc5
+
(
qkuip2
-
qiukp2
)
*
dsc5
);
float
qiuk3
=
qi3
*
atomJ
.
inducedDipole
[
0
]
+
qi6
*
atomJ
.
inducedDipole
[
1
]
+
qi9
*
atomJ
.
inducedDipole
[
2
];
float
qkui3
=
qk3
*
atomI
.
inducedDipole
[
0
]
+
qk6
*
atomI
.
inducedDipole
[
1
]
+
qk9
*
atomI
.
inducedDipole
[
2
];
float
qiukp3
=
qi3
*
atomJ
.
inducedDipoleP
[
0
]
+
qi6
*
atomJ
.
inducedDipoleP
[
1
]
+
qi9
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip3
=
qk3
*
atomI
.
inducedDipoleP
[
0
]
+
qk6
*
atomI
.
inducedDipoleP
[
1
]
+
qk9
*
atomI
.
inducedDipoleP
[
2
];
forceTorqueEnergy
->
z
+=
bn2
*
(
qkui3
+
qkuip3
-
qiuk3
-
qiukp3
);
forceTorqueEnergy
->
z
-=
rr5
*
((
qkui3
-
qiuk3
)
*
psc5
+
(
qkuip3
-
qiukp3
)
*
dsc5
);
float
sc3
=
di1
*
xr
+
di2
*
yr
+
di3
*
zr
;
float
sc4
=
dk1
*
xr
+
dk2
*
yr
+
dk3
*
zr
;
float
qir1
=
qi1
*
xr
+
qi2
*
yr
+
qi3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
sc5
=
qir1
*
xr
+
qir2
*
yr
+
qir3
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk2
*
yr
+
qk3
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk6
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
sc6
=
qkr1
*
xr
+
qkr2
*
yr
+
qkr3
*
zr
;
// calculate the scalar products for induced components
float
sci3
=
atomI
.
inducedDipole
[
0
]
*
xr
+
atomI
.
inducedDipole
[
1
]
*
yr
+
atomI
.
inducedDipole
[
2
]
*
zr
;
float
scip3
=
atomI
.
inducedDipoleP
[
0
]
*
xr
+
atomI
.
inducedDipoleP
[
1
]
*
yr
+
atomI
.
inducedDipoleP
[
2
]
*
zr
;
float
sci4
=
atomJ
.
inducedDipole
[
0
]
*
xr
+
atomJ
.
inducedDipole
[
1
]
*
yr
+
atomJ
.
inducedDipole
[
2
]
*
zr
;
float
scip4
=
atomJ
.
inducedDipoleP
[
0
]
*
xr
+
atomJ
.
inducedDipoleP
[
1
]
*
yr
+
atomJ
.
inducedDipoleP
[
2
]
*
zr
;
float
usc5
=
1.0
f
-
scale5
*
scalingFactors
[
UScaleIndex
];
float
prefactor1
=
0.5
f
*
(
bn2
-
rr5
*
usc5
);
forceTorqueEnergy
->
x
+=
prefactor1
*
(
(
sci4
*
atomI
.
inducedDipoleP
[
0
]
+
scip4
*
atomI
.
inducedDipole
[
0
])
+
sci3
*
atomJ
.
inducedDipoleP
[
0
]
+
scip3
*
atomJ
.
inducedDipole
[
0
]
);
forceTorqueEnergy
->
y
+=
prefactor1
*
(
(
sci4
*
atomI
.
inducedDipoleP
[
1
]
+
scip4
*
atomI
.
inducedDipole
[
1
])
+
sci3
*
atomJ
.
inducedDipoleP
[
1
]
+
scip3
*
atomJ
.
inducedDipole
[
1
]
);
forceTorqueEnergy
->
z
+=
prefactor1
*
(
(
sci4
*
atomI
.
inducedDipoleP
[
2
]
+
scip4
*
atomI
.
inducedDipole
[
2
])
+
sci3
*
atomJ
.
inducedDipoleP
[
2
]
+
scip3
*
atomJ
.
inducedDipole
[
2
]
);
float
dsc7
=
1.0
f
-
scale7
*
scalingFactors
[
DScaleIndex
];
float
psc7
=
1.0
f
-
scale7
*
scalingFactors
[
PScaleIndex
];
float
gfi5
=
bn3
*
(
sci4
+
scip4
)
-
rr7
*
(
sci4
*
psc7
+
scip4
*
dsc7
);
forceTorqueEnergy
->
x
+=
gfi5
*
qir1
;
forceTorqueEnergy
->
y
+=
gfi5
*
qir2
;
forceTorqueEnergy
->
z
+=
gfi5
*
qir3
;
prefactor1
=
0.5
f
*
(
bn2
*
(
sci4
+
scip4
)
-
rr5
*
(
sci4
*
psc5
+
scip4
*
dsc5
)
);
float
prefactor2
=
0.5
f
*
(
bn2
*
(
sci3
+
scip3
)
-
rr5
*
(
sci3
*
psc5
+
scip3
*
dsc5
)
);
forceTorqueEnergy
->
x
+=
prefactor1
*
di1
+
prefactor2
*
dk1
;
forceTorqueEnergy
->
y
+=
prefactor1
*
di2
+
prefactor2
*
dk2
;
forceTorqueEnergy
->
z
+=
prefactor1
*
di3
+
prefactor2
*
dk3
;
float
gfi6
=
-
bn3
*
(
sci3
+
scip3
)
+
rr7
*
(
sci3
*
psc7
+
scip3
*
dsc7
);
forceTorqueEnergy
->
x
+=
gfi6
*
qkr1
;
forceTorqueEnergy
->
y
+=
gfi6
*
qkr2
;
forceTorqueEnergy
->
z
+=
gfi6
*
qkr3
;
float
sci1
=
atomI
.
inducedDipole
[
0
]
*
dk1
+
atomI
.
inducedDipole
[
1
]
*
dk2
+
atomI
.
inducedDipole
[
2
]
*
dk3
+
di1
*
atomJ
.
inducedDipole
[
0
]
+
di2
*
atomJ
.
inducedDipole
[
1
]
+
di3
*
atomJ
.
inducedDipole
[
2
];
float
sci7
=
qir1
*
atomJ
.
inducedDipole
[
0
]
+
qir2
*
atomJ
.
inducedDipole
[
1
]
+
qir3
*
atomJ
.
inducedDipole
[
2
];
float
sci8
=
qkr1
*
atomI
.
inducedDipole
[
0
]
+
qkr2
*
atomI
.
inducedDipole
[
1
]
+
qkr3
*
atomI
.
inducedDipole
[
2
];
float
scip1
=
atomI
.
inducedDipoleP
[
0
]
*
dk1
+
atomI
.
inducedDipoleP
[
1
]
*
dk2
+
atomI
.
inducedDipoleP
[
2
]
*
dk3
+
di1
*
atomJ
.
inducedDipoleP
[
0
]
+
di2
*
atomJ
.
inducedDipoleP
[
1
]
+
di3
*
atomJ
.
inducedDipoleP
[
2
];
float
scip2
=
atomI
.
inducedDipole
[
0
]
*
atomJ
.
inducedDipoleP
[
0
]
+
atomI
.
inducedDipole
[
1
]
*
atomJ
.
inducedDipoleP
[
1
]
+
atomI
.
inducedDipole
[
2
]
*
atomJ
.
inducedDipoleP
[
2
]
+
atomJ
.
inducedDipole
[
0
]
*
atomI
.
inducedDipoleP
[
0
]
+
atomJ
.
inducedDipole
[
1
]
*
atomI
.
inducedDipoleP
[
1
]
+
atomJ
.
inducedDipole
[
2
]
*
atomI
.
inducedDipoleP
[
2
];
float
scip7
=
qir1
*
atomJ
.
inducedDipoleP
[
0
]
+
qir2
*
atomJ
.
inducedDipoleP
[
1
]
+
qir3
*
atomJ
.
inducedDipoleP
[
2
];
float
scip8
=
qkr1
*
atomI
.
inducedDipoleP
[
0
]
+
qkr2
*
atomI
.
inducedDipoleP
[
1
]
+
qkr3
*
atomI
.
inducedDipoleP
[
2
];
// calculate the gl functions for permanent components
// calculate the gl functions for induced components
float
gli1
=
ck
*
sci3
-
ci
*
sci4
;
float
gli2
=
-
sc3
*
sci4
-
sci3
*
sc4
;
float
gli3
=
sci3
*
sc6
-
sci4
*
sc5
;
float
gli6
=
sci1
;
float
gli7
=
2.0
f
*
(
sci7
-
sci8
);
float
glip1
=
ck
*
scip3
-
ci
*
scip4
;
float
glip2
=
-
sc3
*
scip4
-
scip3
*
sc4
;
float
glip3
=
scip3
*
sc6
-
scip4
*
sc5
;
float
glip6
=
scip1
;
float
glip7
=
2.0
f
*
(
scip7
-
scip8
);
float
ei
=
(
bn1
*
(
gli1
+
gli6
)
+
bn2
*
(
gli2
+
gli7
)
+
bn3
*
gli3
);
float
psc3
=
1.0
f
-
scale3
*
scalingFactors
[
PScaleIndex
];
ei
-=
(
rr3
*
(
gli1
+
gli6
)
*
psc3
+
rr5
*
(
gli2
+
gli7
)
*
psc5
+
rr7
*
gli3
*
psc7
);
forceTorqueEnergy
->
w
+=
0.5
f
*
ei
;
float
dsc3
=
1.0
f
-
scale3
*
scalingFactors
[
DScaleIndex
];
float
usc3
=
1.0
f
-
scale3
*
scalingFactors
[
UScaleIndex
];
float
gfi1
=
(
bn2
*
(
gli1
+
glip1
+
gli6
+
glip6
)
+
bn2
*
scip2
+
bn3
*
(
gli2
+
glip2
+
gli7
+
glip7
)
-
bn3
*
(
sci3
*
scip4
+
scip3
*
sci4
)
+
bn4
*
(
gli3
+
glip3
));
gfi1
-=
rr5
*
((
gli1
+
gli6
)
*
psc3
+
(
glip1
+
glip6
)
*
dsc3
+
scip2
*
usc3
)
+
rr7
*
((
gli7
+
gli2
)
*
psc5
+
(
glip7
+
glip2
)
*
dsc5
-
(
sci3
*
scip4
+
scip3
*
sci4
)
*
usc5
)
+
rr9
*
(
gli3
*
psc7
+
glip3
*
dsc7
);
gfi1
*=
0.5
f
;
forceTorqueEnergy
->
x
+=
gfi1
*
xr
;
forceTorqueEnergy
->
y
+=
gfi1
*
yr
;
forceTorqueEnergy
->
z
+=
gfi1
*
zr
;
float
gfi2
=
-
ck
*
bn1
+
sc4
*
bn2
-
sc6
*
bn3
;
float
gfi3
=
ci
*
bn1
+
sc3
*
bn2
+
sc5
*
bn3
;
float
ftm2i1
=
gfi2
*
(
atomI
.
inducedDipole
[
0
]
+
atomI
.
inducedDipoleP
[
0
])
+
gfi3
*
(
atomJ
.
inducedDipole
[
0
]
+
atomJ
.
inducedDipoleP
[
0
]);
float
ftm2i2
=
gfi2
*
(
atomI
.
inducedDipole
[
1
]
+
atomI
.
inducedDipoleP
[
1
])
+
gfi3
*
(
atomJ
.
inducedDipole
[
1
]
+
atomJ
.
inducedDipoleP
[
1
]);
float
ftm2i3
=
gfi2
*
(
atomI
.
inducedDipole
[
2
]
+
atomI
.
inducedDipoleP
[
2
])
+
gfi3
*
(
atomJ
.
inducedDipole
[
2
]
+
atomJ
.
inducedDipoleP
[
2
]);
forceTorqueEnergy
->
x
-=
0.5
f
*
(
-
rr3
*
ck
*
(
atomI
.
inducedDipole
[
0
]
*
psc3
+
atomI
.
inducedDipoleP
[
0
]
*
dsc3
)
+
rr5
*
sc4
*
(
atomI
.
inducedDipole
[
0
]
*
psc5
+
atomI
.
inducedDipoleP
[
0
]
*
dsc5
)
-
rr7
*
sc6
*
(
atomI
.
inducedDipole
[
0
]
*
psc7
+
atomI
.
inducedDipoleP
[
0
]
*
dsc7
)
+
rr3
*
ci
*
(
atomJ
.
inducedDipole
[
0
]
*
psc3
+
atomJ
.
inducedDipoleP
[
0
]
*
dsc3
)
+
rr5
*
sc3
*
(
atomJ
.
inducedDipole
[
0
]
*
psc5
+
atomJ
.
inducedDipoleP
[
0
]
*
dsc5
)
+
rr7
*
sc5
*
(
atomJ
.
inducedDipole
[
0
]
*
psc7
+
atomJ
.
inducedDipoleP
[
0
]
*
dsc7
));
forceTorqueEnergy
->
y
-=
0.5
f
*
(
-
rr3
*
ck
*
(
atomI
.
inducedDipole
[
1
]
*
psc3
+
atomI
.
inducedDipoleP
[
1
]
*
dsc3
)
+
rr5
*
sc4
*
(
atomI
.
inducedDipole
[
1
]
*
psc5
+
atomI
.
inducedDipoleP
[
1
]
*
dsc5
)
-
rr7
*
sc6
*
(
atomI
.
inducedDipole
[
1
]
*
psc7
+
atomI
.
inducedDipoleP
[
1
]
*
dsc7
)
+
rr3
*
ci
*
(
atomJ
.
inducedDipole
[
1
]
*
psc3
+
atomJ
.
inducedDipoleP
[
1
]
*
dsc3
)
+
rr5
*
sc3
*
(
atomJ
.
inducedDipole
[
1
]
*
psc5
+
atomJ
.
inducedDipoleP
[
1
]
*
dsc5
)
+
rr7
*
sc5
*
(
atomJ
.
inducedDipole
[
1
]
*
psc7
+
atomJ
.
inducedDipoleP
[
1
]
*
dsc7
));
forceTorqueEnergy
->
z
-=
0.5
f
*
(
-
rr3
*
ck
*
(
atomI
.
inducedDipole
[
2
]
*
psc3
+
atomI
.
inducedDipoleP
[
2
]
*
dsc3
)
+
rr5
*
sc4
*
(
atomI
.
inducedDipole
[
2
]
*
psc5
+
atomI
.
inducedDipoleP
[
2
]
*
dsc5
)
-
rr7
*
sc6
*
(
atomI
.
inducedDipole
[
2
]
*
psc7
+
atomI
.
inducedDipoleP
[
2
]
*
dsc7
)
+
rr3
*
ci
*
(
atomJ
.
inducedDipole
[
2
]
*
psc3
+
atomJ
.
inducedDipoleP
[
2
]
*
dsc3
)
+
rr5
*
sc3
*
(
atomJ
.
inducedDipole
[
2
]
*
psc5
+
atomJ
.
inducedDipoleP
[
2
]
*
dsc5
)
+
rr7
*
sc5
*
(
atomJ
.
inducedDipole
[
2
]
*
psc7
+
atomJ
.
inducedDipoleP
[
2
]
*
dsc7
));
if
(
damp
!=
0.0
f
){
float
expdamp
=
exp
(
damp
);
float
temp3
=
-
3.0
f
*
damp
*
expdamp
*
rr2
;
float
temp5
=
-
damp
;
float
temp7
=
-
0.2
f
-
0.6
f
*
damp
;
float
ddsc31
=
temp3
*
xr
;
float
ddsc32
=
temp3
*
yr
;
float
ddsc33
=
temp3
*
zr
;
float
ddsc51
=
temp5
*
ddsc31
;
float
ddsc52
=
temp5
*
ddsc32
;
float
ddsc53
=
temp5
*
ddsc33
;
float
ddsc71
=
temp7
*
ddsc51
;
float
ddsc72
=
temp7
*
ddsc52
;
float
ddsc73
=
temp7
*
ddsc53
;
temp3
=
rr3
*
((
gli1
+
gli6
)
*
scalingFactors
[
PScaleIndex
]
+
(
glip1
+
glip6
)
*
scalingFactors
[
DScaleIndex
]);
temp5
=
rr5
*
((
gli2
+
gli7
)
*
scalingFactors
[
PScaleIndex
]
+
(
glip2
+
glip7
)
*
scalingFactors
[
DScaleIndex
]);
temp7
=
rr7
*
(
gli3
*
scalingFactors
[
PScaleIndex
]
+
glip3
*
scalingFactors
[
DScaleIndex
]);
ftm2i1
-=
(
temp3
*
ddsc31
+
temp5
*
ddsc51
+
temp7
*
ddsc71
);
ftm2i2
-=
(
temp3
*
ddsc32
+
temp5
*
ddsc52
+
temp7
*
ddsc72
);
ftm2i3
-=
(
temp3
*
ddsc33
+
temp5
*
ddsc53
+
temp7
*
ddsc73
);
if
(
cAmoebaSim
.
polarizationType
==
0
){
temp3
=
rr3
*
scalingFactors
[
UScaleIndex
]
*
scip2
;
temp5
=
-
rr5
*
scalingFactors
[
UScaleIndex
]
*
(
sci3
*
scip4
+
scip3
*
sci4
);
ftm2i1
-=
(
temp3
*
ddsc31
+
temp5
*
ddsc51
);
ftm2i2
-=
(
temp3
*
ddsc32
+
temp5
*
ddsc52
);
ftm2i3
-=
(
temp3
*
ddsc33
+
temp5
*
ddsc53
);
}
}
if
(
cAmoebaSim
.
polarizationType
){
float
gfd
=
(
bn2
*
scip2
-
bn3
*
(
scip3
*
sci4
+
sci3
*
scip4
))
-
(
rr5
*
scip2
*
usc3
-
rr7
*
(
scip3
*
sci4
+
sci3
*
scip4
)
*
usc5
);
float
p1
=
(
bn2
-
usc5
*
rr5
);
ftm2i1
-=
gfd
*
xr
+
p1
*
(
sci4
*
atomI
.
inducedDipoleP
[
0
]
+
scip4
*
atomI
.
inducedDipole
[
0
]
+
sci3
*
atomJ
.
inducedDipoleP
[
0
]
+
scip3
*
atomJ
.
inducedDipole
[
0
]);
ftm2i2
-=
gfd
*
yr
+
p1
*
(
sci4
*
atomI
.
inducedDipoleP
[
1
]
+
scip4
*
atomI
.
inducedDipole
[
1
]
+
sci3
*
atomJ
.
inducedDipoleP
[
1
]
+
scip3
*
atomJ
.
inducedDipole
[
1
]);
ftm2i3
-=
gfd
*
zr
+
p1
*
(
sci4
*
atomI
.
inducedDipoleP
[
2
]
+
scip4
*
atomI
.
inducedDipole
[
2
]
+
sci3
*
atomJ
.
inducedDipoleP
[
2
]
+
scip3
*
atomJ
.
inducedDipole
[
2
]);
}
forceTorqueEnergy
->
x
+=
0.5
f
*
ftm2i1
;
forceTorqueEnergy
->
y
+=
0.5
f
*
ftm2i2
;
forceTorqueEnergy
->
z
+=
0.5
f
*
ftm2i3
;
return
;
}
/**
 * Permanent-multipole contribution for the ordered pair (atomI, atomJ).
 *
 * Reads atomI's lab-frame dipole and quadrupole and atomJ's charge, lab-frame
 * dipole and quadrupole.  Every coefficient below has the form
 *     (term built from bn.x .. bn.w)  -  scale * (term built from 1/r^n),
 * where scale = 1 - scalingFactors[MScaleIndex].
 *
 * The quadrupole is stored as six values, indexed 0..5 -> (1,2,3,5,6,9) in the
 * qi/qk naming used below; the code treats it as a symmetric 3x3 tensor.
 *
 * NOTE(review): the result is presumably the torque on atomI (the original
 * locals were named ttm2*); confirm against the caller.
 *
 * @param atomI             particle whose dipole/quadrupole are used
 * @param atomJ             particle whose charge/dipole/quadrupole are used
 * @param delta             x,y,z = separation vector, w = its length r
 * @param bn                bn.x..bn.w are used as bn1..bn4
 * @param scalingFactors    only scalingFactors[MScaleIndex] is read
 * @param forceTorqueEnergy x,y,z are OVERWRITTEN (not accumulated); w is untouched
 */
static __device__ void calculatePmeDirectElectrostaticPairIxnT1_kernel( const PmeDirectElectrostaticParticle& atomI,
                                                                        const PmeDirectElectrostaticParticle& atomJ,
                                                                        const float4 delta,
                                                                        const float4 bn,
                                                                        const float* scalingFactors,
                                                                        float4* forceTorqueEnergy ){

    // separation vector and distance

    const float xr  = delta.x;
    const float yr  = delta.y;
    const float zr  = delta.z;
    const float r   = delta.w;

    // permanent multipoles of atom I

    const float di1 = atomI.labFrameDipole[0];
    const float di2 = atomI.labFrameDipole[1];
    const float di3 = atomI.labFrameDipole[2];

    const float qi1 = atomI.labFrameQuadrupole[0];
    const float qi2 = atomI.labFrameQuadrupole[1];
    const float qi3 = atomI.labFrameQuadrupole[2];
    const float qi5 = atomI.labFrameQuadrupole[3];
    const float qi6 = atomI.labFrameQuadrupole[4];
    const float qi9 = atomI.labFrameQuadrupole[5];

    // permanent multipoles of atom J

    const float ck  = atomJ.q;

    const float dk1 = atomJ.labFrameDipole[0];
    const float dk2 = atomJ.labFrameDipole[1];
    const float dk3 = atomJ.labFrameDipole[2];

    const float qk1 = atomJ.labFrameQuadrupole[0];
    const float qk2 = atomJ.labFrameQuadrupole[1];
    const float qk3 = atomJ.labFrameQuadrupole[2];
    const float qk5 = atomJ.labFrameQuadrupole[3];
    const float qk6 = atomJ.labFrameQuadrupole[4];
    const float qk9 = atomJ.labFrameQuadrupole[5];

    const float bn1 = bn.x;
    const float bn2 = bn.y;
    const float bn3 = bn.z;
    const float bn4 = bn.w;

    // inverse powers of r: rr3 = 1/r^3, rr5 = 3/r^5, rr7 = 15/r^7, rr9 = 105/r^9

    const float rr1 = 1.0f/r;
    const float rr2 = rr1*rr1;
    const float rr3 = rr1*rr2;
    const float rr5 = 3.0f*rr3*rr2;
    const float rr7 = 5.0f*rr5*rr2;
    const float rr9 = 7.0f*rr7*rr2;

    const float scale = 1.0f - scalingFactors[MScaleIndex];

    // quadrupole contractions shared by several terms

    const float qir1   = qi1*xr + qi2*yr + qi3*zr;
    const float qir2   = qi2*xr + qi5*yr + qi6*zr;
    const float qir3   = qi3*xr + qi6*yr + qi9*zr;

    const float qkr1   = qk1*xr + qk2*yr + qk3*zr;
    const float qkr2   = qk2*xr + qk5*yr + qk6*zr;
    const float qkr3   = qk3*xr + qk6*yr + qk9*zr;

    const float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
    const float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
    const float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;

    const float qidk1  = qi1*dk1 + qi2*dk2 + qi3*dk3;
    const float qidk2  = qi2*dk1 + qi5*dk2 + qi6*dk3;
    const float qidk3  = qi3*dk1 + qi6*dk2 + qi9*dk3;

    const float sc4    = dk1*xr + dk2*yr + dk3*zr;
    const float sc6    = qkr1*xr + qkr2*yr + qkr3*zr;

    // ---- term 1: dipole(I) x dipole(J)

    const float dipoleDipoleCoeff = scale*rr3 - bn1;

    float tq1 = dipoleDipoleCoeff*( di2*dk3 - di3*dk2 );
    float tq2 = dipoleDipoleCoeff*( di3*dk1 - di1*dk3 );
    float tq3 = dipoleDipoleCoeff*( di1*dk2 - di2*dk1 );

    // ---- term 2: quadrupole(I) / quadrupole(J)

    const float quadQuadCoeff = 4.0f*( bn3 - scale*rr7 );

    const float rxqikr1  = yr*qiqkr3 - zr*qiqkr2;
    const float rxqikr2  = zr*qiqkr1 - xr*qiqkr3;
    const float rxqikr3  = xr*qiqkr2 - yr*qiqkr1;

    const float qkrxqir1 = qkr2*qir3 - qkr3*qir2;
    const float qkrxqir2 = qkr3*qir1 - qkr1*qir3;
    const float qkrxqir3 = qkr1*qir2 - qkr2*qir1;

    tq1 -= quadQuadCoeff*( rxqikr1 + qkrxqir1 );
    tq2 -= quadQuadCoeff*( rxqikr2 + qkrxqir2 );
    tq3 -= quadQuadCoeff*( rxqikr3 + qkrxqir3 );

    // ---- term 3: mixed dipole / quadrupole

    const float dipoleQuadCoeff = 2.0f*( bn2 - scale*rr5 );

    const float dixqkr1 = di2*qkr3 - di3*qkr2;
    const float dixqkr2 = di3*qkr1 - di1*qkr3;
    const float dixqkr3 = di1*qkr2 - di2*qkr1;

    const float dkxqir1 = dk2*qir3 - dk3*qir2;
    const float dkxqir2 = dk3*qir1 - dk1*qir3;
    const float dkxqir3 = dk1*qir2 - dk2*qir1;

    const float rxqidk1 = yr*qidk3 - zr*qidk2;
    const float rxqidk2 = zr*qidk1 - xr*qidk3;
    const float rxqidk3 = xr*qidk2 - yr*qidk1;

    const float qixqk1  = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
    const float qixqk2  = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
    const float qixqk3  = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;

    tq1 += dipoleQuadCoeff*( dixqkr1 + dkxqir1 + rxqidk1 - 2.0f*qixqk1 );
    tq2 += dipoleQuadCoeff*( dixqkr2 + dkxqir2 + rxqidk2 - 2.0f*qixqk2 );
    tq3 += dipoleQuadCoeff*( dixqkr3 + dkxqir3 + rxqidk3 - 2.0f*qixqk3 );

    // ---- term 4: dipole(I) x r

    const float gf2           = -ck*bn1 + sc4*bn2 - sc6*bn3;
    const float gfr2          = -ck*rr3 + sc4*rr5 - sc6*rr7;
    const float dipoleRCoeff  = ( gf2 - scale*gfr2 );

    tq1 += dipoleRCoeff*( di2*zr - di3*yr );
    tq2 += dipoleRCoeff*( di3*xr - di1*zr );
    tq3 += dipoleRCoeff*( di1*yr - di2*xr );

    // ---- term 5: r x (quadrupole(I) . r)

    const float gf5           = ( -ck*bn2 + sc4*bn3 - sc6*bn4 );
    const float gfr5          = ( -ck*rr5 + sc4*rr7 - sc6*rr9 );
    const float quadRCoeff    = 2.0f*( gf5 - scale*gfr5 );

    const float rxqir1 = yr*qir3 - zr*qir2;
    const float rxqir2 = zr*qir1 - xr*qir3;
    const float rxqir3 = xr*qir2 - yr*qir1;

    tq1 -= quadRCoeff*rxqir1;
    tq2 -= quadRCoeff*rxqir2;
    tq3 -= quadRCoeff*rxqir3;

    // plain assignment: any previous content of x,y,z is discarded

    forceTorqueEnergy->x = tq1;
    forceTorqueEnergy->y = tq2;
    forceTorqueEnergy->z = tq3;

    return;
}
/**
 * Induced-dipole contribution for the ordered pair (atomI, atomJ).
 *
 * Combines atomI's lab-frame dipole and quadrupole with atomJ's two induced
 * dipole sets (inducedDipole and inducedDipoleP).  The 1/r^n parts are damped
 * with the Thole factors scale3/5/7 and weighted by
 * scalingFactors[PScaleIndex] (paired with inducedDipole) and
 * scalingFactors[DScaleIndex] (paired with inducedDipoleP).
 *
 * NOTE(review): presumably the induced part of the torque on atomI (original
 * locals were named ttm2i*); confirm against the caller.
 *
 * @param atomI             particle whose dipole, quadrupole, damp and thole are read
 * @param atomJ             particle whose induced dipoles, damp and thole are read
 * @param delta             x,y,z = separation vector, w = its length r
 * @param bn                bn.x..bn.z are used as bn1..bn3 (bn.w is not read)
 * @param scalingFactors    entries DScaleIndex and PScaleIndex are read
 * @param forceTorqueEnergy x,y,z are ACCUMULATED into (+=); w is untouched
 */
static __device__ void calculatePmeDirectElectrostaticPairIxnT2_kernel( const PmeDirectElectrostaticParticle& atomI,
                                                                        const PmeDirectElectrostaticParticle& atomJ,
                                                                        const float4 delta,
                                                                        const float4 bn,
                                                                        const float* scalingFactors,
                                                                        float4* forceTorqueEnergy ){

    // separation vector and distance

    const float xr   = delta.x;
    const float yr   = delta.y;
    const float zr   = delta.z;
    const float r    = delta.w;

    // permanent multipoles of atom I

    const float di1  = atomI.labFrameDipole[0];
    const float di2  = atomI.labFrameDipole[1];
    const float di3  = atomI.labFrameDipole[2];

    const float qi1  = atomI.labFrameQuadrupole[0];
    const float qi2  = atomI.labFrameQuadrupole[1];
    const float qi3  = atomI.labFrameQuadrupole[2];
    const float qi5  = atomI.labFrameQuadrupole[3];
    const float qi6  = atomI.labFrameQuadrupole[4];
    const float qi9  = atomI.labFrameQuadrupole[5];

    // induced dipoles of atom J (both sets)

    const float uk1  = atomJ.inducedDipole[0];
    const float uk2  = atomJ.inducedDipole[1];
    const float uk3  = atomJ.inducedDipole[2];

    const float ukp1 = atomJ.inducedDipoleP[0];
    const float ukp2 = atomJ.inducedDipoleP[1];
    const float ukp3 = atomJ.inducedDipoleP[2];

    const float bn1  = bn.x;
    const float bn2  = bn.y;
    const float bn3  = bn.z;

    // inverse powers of r: rr3 = 1/r^3, rr5 = 3/r^5, rr7 = 15/r^7

    const float rr1  = 1.0f/r;
    const float rr2  = rr1*rr1;
    const float rr3  = rr1*rr2;
    const float rr5  = 3.0f*rr3*rr2;
    const float rr7  = 5.0f*rr5*rr2;

    // Thole polarization damping; the factors stay at 1 when either atom has
    // damp == 0 or when the exponent is <= -50 (exp() would be negligible)

    float scale3 = 1.0f;
    float scale5 = 1.0f;
    float scale7 = 1.0f;

    const float dampProduct = atomI.damp*atomJ.damp;
    if( dampProduct != 0.0f ){

        // smaller of the two thole values
        float pgamma = atomJ.thole;
        if( atomI.thole < atomJ.thole ){
            pgamma = atomI.thole;
        }

        const float ratio        = r/dampProduct;
        const float dampExponent = -pgamma*ratio*ratio*ratio;
        if( dampExponent > -50.0f ){
            const float expdamp = exp( dampExponent );
            scale3 = 1.0f - expdamp;
            scale5 = 1.0f - ( 1.0f - dampExponent )*expdamp;
            scale7 = 1.0f - ( 1.0f - dampExponent + 0.6f*dampExponent*dampExponent )*expdamp;
        }
    }

    const float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex];
    const float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex];
    const float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex];

    const float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
    const float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
    const float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];

    // contractions shared by several terms

    const float sci4   = uk1*xr + uk2*yr + uk3*zr;
    const float scip4  = ukp1*xr + ukp2*yr + ukp3*zr;

    const float qir1   = qi1*xr + qi2*yr + qi3*zr;
    const float qir2   = qi2*xr + qi5*yr + qi6*zr;
    const float qir3   = qi3*xr + qi6*yr + qi9*zr;

    const float qiuk1  = qi1*uk1 + qi2*uk2 + qi3*uk3;
    const float qiuk2  = qi2*uk1 + qi5*uk2 + qi6*uk3;
    const float qiuk3  = qi3*uk1 + qi6*uk2 + qi9*uk3;

    const float qiukp1 = qi1*ukp1 + qi2*ukp2 + qi3*ukp3;
    const float qiukp2 = qi2*ukp1 + qi5*ukp2 + qi6*ukp3;
    const float qiukp3 = qi3*ukp1 + qi6*ukp2 + qi9*ukp3;

    // ---- term 1: dipole(I) x induced dipole(J)

    const float dipUkCoeff  = 0.5f*( psc3*rr3 - bn1 );
    const float dipUkpCoeff = 0.5f*( dsc3*rr3 - bn1 );

    const float dixuk1  = di2*uk3 - di3*uk2;
    const float dixuk2  = di3*uk1 - di1*uk3;
    const float dixuk3  = di1*uk2 - di2*uk1;

    const float dixukp1 = di2*ukp3 - di3*ukp2;
    const float dixukp2 = di3*ukp1 - di1*ukp3;
    const float dixukp3 = di1*ukp2 - di2*ukp1;

    float tq1 = dipUkCoeff*dixuk1 + dipUkpCoeff*dixukp1;
    float tq2 = dipUkCoeff*dixuk2 + dipUkpCoeff*dixukp2;
    float tq3 = dipUkCoeff*dixuk3 + dipUkpCoeff*dixukp3;

    // ---- term 2: dipole(I) x r

    const float gti2        = bn2*( sci4 + scip4 );
    const float gtri2       = rr5*( sci4*psc5 + scip4*dsc5 );
    const float dipRCoeff   = 0.5f*( gti2 - gtri2 );

    tq1 += dipRCoeff*( di2*zr - di3*yr );
    tq2 += dipRCoeff*( di3*xr - di1*zr );
    tq3 += dipRCoeff*( di1*yr - di2*xr );

    // ---- term 3: r x (quadrupole(I) . r)

    const float quadRCoeff  = rr7*( sci4*psc7 + scip4*dsc7 ) - bn3*( sci4 + scip4 );

    tq1 += quadRCoeff*( yr*qir3 - zr*qir2 );
    tq2 += quadRCoeff*( zr*qir1 - xr*qir3 );
    tq3 += quadRCoeff*( xr*qir2 - yr*qir1 );

    // ---- term 4: induced dipole(J) / quadrupole(I)

    const float quadUkCoeff  = ( bn2 - rr5*psc5 );
    const float quadUkpCoeff = ( bn2 - rr5*dsc5 );

    const float ukxqir1  = uk2*qir3 - uk3*qir2;
    const float ukxqir2  = uk3*qir1 - uk1*qir3;
    const float ukxqir3  = uk1*qir2 - uk2*qir1;

    const float ukxqirp1 = ukp2*qir3 - ukp3*qir2;
    const float ukxqirp2 = ukp3*qir1 - ukp1*qir3;
    const float ukxqirp3 = ukp1*qir2 - ukp2*qir1;

    const float rxqiuk1  = yr*qiuk3 - zr*qiuk2;
    const float rxqiuk2  = zr*qiuk1 - xr*qiuk3;
    const float rxqiuk3  = xr*qiuk2 - yr*qiuk1;

    const float rxqiukp1 = yr*qiukp3 - zr*qiukp2;
    const float rxqiukp2 = zr*qiukp1 - xr*qiukp3;
    const float rxqiukp3 = xr*qiukp2 - yr*qiukp1;

    tq1 += quadUkCoeff*( ukxqir1 + rxqiuk1 ) + quadUkpCoeff*( ukxqirp1 + rxqiukp1 );
    tq2 += quadUkCoeff*( ukxqir2 + rxqiuk2 ) + quadUkpCoeff*( ukxqirp2 + rxqiukp2 );
    tq3 += quadUkCoeff*( ukxqir3 + rxqiuk3 ) + quadUkpCoeff*( ukxqirp3 + rxqiukp3 );

    // accumulate on top of whatever the slot already holds

    forceTorqueEnergy->x += tq1;
    forceTorqueEnergy->y += tq2;
    forceTorqueEnergy->z += tq3;

    return;
}
/**
 * Direct-space PME electrostatic interaction for one pair of particles.
 *
 * Steps:
 *   1. build the J - I separation and wrap each component into the periodic
 *      box described by cSim;
 *   2. if the pair is beyond cSim.nonbondedCutoffSqr or the two positions
 *      coincide, zero the outputs and return;
 *   3. build the bn0..bn5 recursion from erfc/exp of (alphaEwald * r);
 *   4. call the F1/F2 kernels on forceTorqueEnergy[0], and the T1/T2 kernels
 *      on forceTorqueEnergy[1] (I,J order) and forceTorqueEnergy[2]
 *      (J,I order with the separation negated).
 *
 * @param atomI             first particle
 * @param atomJ             second particle
 * @param scalingFactors    forwarded unchanged to the F1/F2/T1/T2 kernels
 * @param forceTorqueEnergy three output slots; [0].w is set to qI*qJ*bn0 before
 *                          the F1 kernel runs.  On the early-exit path
 *                          [1].w and [2].w are left untouched.
 */
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( const PmeDirectElectrostaticParticle& atomI,
                                                               const PmeDirectElectrostaticParticle& atomJ,
                                                               const float* scalingFactors,
                                                               float4 forceTorqueEnergy[3] ){

    // separation J - I, wrapped into the periodic box

    float4 delta;
    delta.x  = atomJ.x - atomI.x;
    delta.y  = atomJ.y - atomI.y;
    delta.z  = atomJ.z - atomI.z;

    delta.x -= floor( delta.x*cSim.invPeriodicBoxSizeX + 0.5f )*cSim.periodicBoxSizeX;
    delta.y -= floor( delta.y*cSim.invPeriodicBoxSizeY + 0.5f )*cSim.periodicBoxSizeY;
    delta.z -= floor( delta.z*cSim.invPeriodicBoxSizeZ + 0.5f )*cSim.periodicBoxSizeZ;

    const float rSquared = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;

    // outside the cutoff, or coincident positions: no interaction

    if( rSquared > cSim.nonbondedCutoffSqr || rSquared == 0.0f ){

        forceTorqueEnergy[0].x = 0.0f;
        forceTorqueEnergy[0].y = 0.0f;
        forceTorqueEnergy[0].z = 0.0f;
        forceTorqueEnergy[0].w = 0.0f;

        forceTorqueEnergy[1].x = 0.0f;
        forceTorqueEnergy[1].y = 0.0f;
        forceTorqueEnergy[1].z = 0.0f;

        forceTorqueEnergy[2].x = 0.0f;
        forceTorqueEnergy[2].y = 0.0f;
        forceTorqueEnergy[2].z = 0.0f;

        return;
    }

    // the pair kernels expect the distance in delta.w

    delta.w               = sqrt( rSquared );
    const float r         = delta.w;

    // real-space Ewald terms

    const float ralpha    = cSim.alphaEwald*r;
    const float alsq2     = 2.0f*cSim.alphaEwald*cSim.alphaEwald;

    // running factor: starts at 1/(sqrt(pi)*alpha) and is multiplied by
    // alsq2 before each bn level; stays 0 when alphaEwald <= 0
    float alsq2n          = 0.0f;
    if( cSim.alphaEwald > 0.0f ){
        alsq2n = 1.0f/( cAmoebaSim.sqrtPi*cSim.alphaEwald );
    }

    const float exp2a     = exp( -( ralpha*ralpha ) );
    const float rr1       = 1.0f/r;
    const float bn0       = erfc( ralpha )*rr1;
    const float rr2       = rr1*rr1;

    forceTorqueEnergy[0].w = atomI.q*atomJ.q*bn0;

    // bn(k) = ( (2k-1)*bn(k-1) + alsq2n(k)*exp2a ) / r^2

    float4 bn;

    alsq2n    *= alsq2;
    bn.x       = ( bn0 + alsq2n*exp2a )*rr2;

    alsq2n    *= alsq2;
    bn.y       = ( 3.0f*bn.x + alsq2n*exp2a )*rr2;

    alsq2n    *= alsq2;
    bn.z       = ( 5.0f*bn.y + alsq2n*exp2a )*rr2;

    alsq2n    *= alsq2;
    bn.w       = ( 7.0f*bn.z + alsq2n*exp2a )*rr2;

    alsq2n    *= alsq2;
    const float bn5 = ( 9.0f*bn.w + alsq2n*exp2a )*rr2;

    // slot 0: F1 then F2

    calculatePmeDirectElectrostaticPairIxnF1_kernel( atomI, atomJ, delta, bn, bn5, scalingFactors, forceTorqueEnergy );
    calculatePmeDirectElectrostaticPairIxnF2_kernel( atomI, atomJ, delta, bn,      scalingFactors, forceTorqueEnergy );

    // slot 1: T1 overwrites x,y,z, T2 then adds to them

    calculatePmeDirectElectrostaticPairIxnT1_kernel( atomI, atomJ, delta, bn, scalingFactors, ( forceTorqueEnergy + 1 ) );
    calculatePmeDirectElectrostaticPairIxnT2_kernel( atomI, atomJ, delta, bn, scalingFactors, ( forceTorqueEnergy + 1 ) );

    // slot 2: same two kernels with the particles swapped and the
    // separation vector reversed
    // T3 == T1 w/ particles I and J reversed
    // T4 == T2 w/ particles I and J reversed

    delta.x *= -1.0f;
    delta.y *= -1.0f;
    delta.z *= -1.0f;

    calculatePmeDirectElectrostaticPairIxnT1_kernel( atomJ, atomI, delta, bn, scalingFactors, ( forceTorqueEnergy + 2 ) );
    calculatePmeDirectElectrostaticPairIxnT2_kernel( atomJ, atomI, delta, bn, scalingFactors, ( forceTorqueEnergy + 2 ) );

    return;
}
__device__
void
calculatePmeDirectElectrostaticPairIxnOrig_kernel
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
,
float
*
scalingFactors
,
float4
forceTorqueEnergy
[
3
]
float
*
scalingFactors
,
float4
forceTorqueEnergy
[
3
]
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
,
float4
*
debugArray
,
float4
*
debugArray
...
@@ -186,12 +1020,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
...
@@ -186,12 +1020,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi1
=
atomI
.
labFrameQuadrupole
[
0
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi2
=
atomI
.
labFrameQuadrupole
[
1
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi3
=
atomI
.
labFrameQuadrupole
[
2
];
float
qi4
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
3
];
float
qi5
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
4
];
float
qi6
=
atomI
.
labFrameQuadrupole
[
5
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
5
];
float
qi7
=
atomI
.
labFrameQuadrupole
[
6
];
float
qi8
=
atomI
.
labFrameQuadrupole
[
7
];
float
qi9
=
atomI
.
labFrameQuadrupole
[
8
];
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk1
=
atomJ
.
labFrameDipole
[
0
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
float
dk2
=
atomJ
.
labFrameDipole
[
1
];
...
@@ -200,12 +1031,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
...
@@ -200,12 +1031,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk1
=
atomJ
.
labFrameQuadrupole
[
0
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk2
=
atomJ
.
labFrameQuadrupole
[
1
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk3
=
atomJ
.
labFrameQuadrupole
[
2
];
float
qk4
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
3
];
float
qk5
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
4
];
float
qk6
=
atomJ
.
labFrameQuadrupole
[
5
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
5
];
float
qk7
=
atomJ
.
labFrameQuadrupole
[
6
];
float
qk8
=
atomJ
.
labFrameQuadrupole
[
7
];
float
qk9
=
atomJ
.
labFrameQuadrupole
[
8
];
// calculate the real space error function terms
// calculate the real space error function terms
...
@@ -325,25 +1153,25 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
...
@@ -325,25 +1153,25 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
dkxr2
=
dk3
*
xr
-
dk1
*
zr
;
float
dkxr2
=
dk3
*
xr
-
dk1
*
zr
;
float
dkxr3
=
dk1
*
yr
-
dk2
*
xr
;
float
dkxr3
=
dk1
*
yr
-
dk2
*
xr
;
float
qir1
=
qi1
*
xr
+
qi
4
*
yr
+
qi
7
*
zr
;
float
qir1
=
qi1
*
xr
+
qi
2
*
yr
+
qi
3
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi
8
*
zr
;
float
qir2
=
qi2
*
xr
+
qi5
*
yr
+
qi
6
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
qir3
=
qi3
*
xr
+
qi6
*
yr
+
qi9
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk
4
*
yr
+
qk
7
*
zr
;
float
qkr1
=
qk1
*
xr
+
qk
2
*
yr
+
qk
3
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk
8
*
zr
;
float
qkr2
=
qk2
*
xr
+
qk5
*
yr
+
qk
6
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
qkr3
=
qk3
*
xr
+
qk6
*
yr
+
qk9
*
zr
;
float
qiqkr1
=
qi1
*
qkr1
+
qi
4
*
qkr2
+
qi
7
*
qkr3
;
float
qiqkr1
=
qi1
*
qkr1
+
qi
2
*
qkr2
+
qi
3
*
qkr3
;
float
qiqkr2
=
qi2
*
qkr1
+
qi5
*
qkr2
+
qi
8
*
qkr3
;
float
qiqkr2
=
qi2
*
qkr1
+
qi5
*
qkr2
+
qi
6
*
qkr3
;
float
qiqkr3
=
qi3
*
qkr1
+
qi6
*
qkr2
+
qi9
*
qkr3
;
float
qiqkr3
=
qi3
*
qkr1
+
qi6
*
qkr2
+
qi9
*
qkr3
;
float
qkqir1
=
qk1
*
qir1
+
qk
4
*
qir2
+
qk
7
*
qir3
;
float
qkqir1
=
qk1
*
qir1
+
qk
2
*
qir2
+
qk
3
*
qir3
;
float
qkqir2
=
qk2
*
qir1
+
qk5
*
qir2
+
qk
8
*
qir3
;
float
qkqir2
=
qk2
*
qir1
+
qk5
*
qir2
+
qk
6
*
qir3
;
float
qkqir3
=
qk3
*
qir1
+
qk6
*
qir2
+
qk9
*
qir3
;
float
qkqir3
=
qk3
*
qir1
+
qk6
*
qir2
+
qk9
*
qir3
;
float
qixqk1
=
qi2
*
qk3
+
qi5
*
qk6
+
qi
8
*
qk9
-
qi3
*
qk2
-
qi6
*
qk5
-
qi9
*
qk
8
;
float
qixqk1
=
qi2
*
qk3
+
qi5
*
qk6
+
qi
6
*
qk9
-
qi3
*
qk2
-
qi6
*
qk5
-
qi9
*
qk
6
;
float
qixqk2
=
qi3
*
qk1
+
qi6
*
qk
4
+
qi9
*
qk
7
-
qi1
*
qk3
-
qi
4
*
qk6
-
qi
7
*
qk9
;
float
qixqk2
=
qi3
*
qk1
+
qi6
*
qk
2
+
qi9
*
qk
3
-
qi1
*
qk3
-
qi
2
*
qk6
-
qi
3
*
qk9
;
float
qixqk3
=
qi1
*
qk2
+
qi
4
*
qk5
+
qi
7
*
qk
8
-
qi2
*
qk1
-
qi5
*
qk
4
-
qi
8
*
qk
7
;
float
qixqk3
=
qi1
*
qk2
+
qi
2
*
qk5
+
qi
3
*
qk
6
-
qi2
*
qk1
-
qi5
*
qk
2
-
qi
6
*
qk
3
;
float
rxqir1
=
yr
*
qir3
-
zr
*
qir2
;
float
rxqir1
=
yr
*
qir3
-
zr
*
qir2
;
float
rxqir2
=
zr
*
qir1
-
xr
*
qir3
;
float
rxqir2
=
zr
*
qir1
-
xr
*
qir3
;
...
@@ -365,28 +1193,28 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
...
@@ -365,28 +1193,28 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
qkrxqir2
=
qkr3
*
qir1
-
qkr1
*
qir3
;
float
qkrxqir2
=
qkr3
*
qir1
-
qkr1
*
qir3
;
float
qkrxqir3
=
qkr1
*
qir2
-
qkr2
*
qir1
;
float
qkrxqir3
=
qkr1
*
qir2
-
qkr2
*
qir1
;
float
qidk1
=
qi1
*
dk1
+
qi
4
*
dk2
+
qi
7
*
dk3
;
float
qidk1
=
qi1
*
dk1
+
qi
2
*
dk2
+
qi
3
*
dk3
;
float
qidk2
=
qi2
*
dk1
+
qi5
*
dk2
+
qi
8
*
dk3
;
float
qidk2
=
qi2
*
dk1
+
qi5
*
dk2
+
qi
6
*
dk3
;
float
qidk3
=
qi3
*
dk1
+
qi6
*
dk2
+
qi9
*
dk3
;
float
qidk3
=
qi3
*
dk1
+
qi6
*
dk2
+
qi9
*
dk3
;
float
qkdi1
=
qk1
*
di1
+
qk
4
*
di2
+
qk
7
*
di3
;
float
qkdi1
=
qk1
*
di1
+
qk
2
*
di2
+
qk
3
*
di3
;
float
qkdi2
=
qk2
*
di1
+
qk5
*
di2
+
qk
8
*
di3
;
float
qkdi2
=
qk2
*
di1
+
qk5
*
di2
+
qk
6
*
di3
;
float
qkdi3
=
qk3
*
di1
+
qk6
*
di2
+
qk9
*
di3
;
float
qkdi3
=
qk3
*
di1
+
qk6
*
di2
+
qk9
*
di3
;
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi
4
*
atomJ
.
inducedDipole
[
1
]
+
qi
7
*
atomJ
.
inducedDipole
[
2
];
float
qiuk1
=
qi1
*
atomJ
.
inducedDipole
[
0
]
+
qi
2
*
atomJ
.
inducedDipole
[
1
]
+
qi
3
*
atomJ
.
inducedDipole
[
2
];
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi
8
*
atomJ
.
inducedDipole
[
2
];
float
qiuk2
=
qi2
*
atomJ
.
inducedDipole
[
0
]
+
qi5
*
atomJ
.
inducedDipole
[
1
]
+
qi
6
*
atomJ
.
inducedDipole
[
2
];
float
qiuk3
=
qi3
*
atomJ
.
inducedDipole
[
0
]
+
qi6
*
atomJ
.
inducedDipole
[
1
]
+
qi9
*
atomJ
.
inducedDipole
[
2
];
float
qiuk3
=
qi3
*
atomJ
.
inducedDipole
[
0
]
+
qi6
*
atomJ
.
inducedDipole
[
1
]
+
qi9
*
atomJ
.
inducedDipole
[
2
];
float
qkui1
=
qk1
*
atomI
.
inducedDipole
[
0
]
+
qk
4
*
atomI
.
inducedDipole
[
1
]
+
qk
7
*
atomI
.
inducedDipole
[
2
];
float
qkui1
=
qk1
*
atomI
.
inducedDipole
[
0
]
+
qk
2
*
atomI
.
inducedDipole
[
1
]
+
qk
3
*
atomI
.
inducedDipole
[
2
];
float
qkui2
=
qk2
*
atomI
.
inducedDipole
[
0
]
+
qk5
*
atomI
.
inducedDipole
[
1
]
+
qk
8
*
atomI
.
inducedDipole
[
2
];
float
qkui2
=
qk2
*
atomI
.
inducedDipole
[
0
]
+
qk5
*
atomI
.
inducedDipole
[
1
]
+
qk
6
*
atomI
.
inducedDipole
[
2
];
float
qkui3
=
qk3
*
atomI
.
inducedDipole
[
0
]
+
qk6
*
atomI
.
inducedDipole
[
1
]
+
qk9
*
atomI
.
inducedDipole
[
2
];
float
qkui3
=
qk3
*
atomI
.
inducedDipole
[
0
]
+
qk6
*
atomI
.
inducedDipole
[
1
]
+
qk9
*
atomI
.
inducedDipole
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi
4
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
7
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp1
=
qi1
*
atomJ
.
inducedDipoleP
[
0
]
+
qi
2
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
3
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
8
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp2
=
qi2
*
atomJ
.
inducedDipoleP
[
0
]
+
qi5
*
atomJ
.
inducedDipoleP
[
1
]
+
qi
6
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp3
=
qi3
*
atomJ
.
inducedDipoleP
[
0
]
+
qi6
*
atomJ
.
inducedDipoleP
[
1
]
+
qi9
*
atomJ
.
inducedDipoleP
[
2
];
float
qiukp3
=
qi3
*
atomJ
.
inducedDipoleP
[
0
]
+
qi6
*
atomJ
.
inducedDipoleP
[
1
]
+
qi9
*
atomJ
.
inducedDipoleP
[
2
];
float
qkuip1
=
qk1
*
atomI
.
inducedDipoleP
[
0
]
+
qk
4
*
atomI
.
inducedDipoleP
[
1
]
+
qk
7
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip1
=
qk1
*
atomI
.
inducedDipoleP
[
0
]
+
qk
2
*
atomI
.
inducedDipoleP
[
1
]
+
qk
3
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip2
=
qk2
*
atomI
.
inducedDipoleP
[
0
]
+
qk5
*
atomI
.
inducedDipoleP
[
1
]
+
qk
8
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip2
=
qk2
*
atomI
.
inducedDipoleP
[
0
]
+
qk5
*
atomI
.
inducedDipoleP
[
1
]
+
qk
6
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip3
=
qk3
*
atomI
.
inducedDipoleP
[
0
]
+
qk6
*
atomI
.
inducedDipoleP
[
1
]
+
qk9
*
atomI
.
inducedDipoleP
[
2
];
float
qkuip3
=
qk3
*
atomI
.
inducedDipoleP
[
0
]
+
qk6
*
atomI
.
inducedDipoleP
[
1
]
+
qk9
*
atomI
.
inducedDipoleP
[
2
];
float
dixqkr1
=
di2
*
qkr3
-
di3
*
qkr2
;
float
dixqkr1
=
di2
*
qkr3
-
di3
*
qkr2
;
...
@@ -448,8 +1276,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
...
@@ -448,8 +1276,8 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
sc8
=
qkr1
*
di1
+
qkr2
*
di2
+
qkr3
*
di3
;
float
sc8
=
qkr1
*
di1
+
qkr2
*
di2
+
qkr3
*
di3
;
float
sc9
=
qir1
*
qkr1
+
qir2
*
qkr2
+
qir3
*
qkr3
;
float
sc9
=
qir1
*
qkr1
+
qir2
*
qkr2
+
qir3
*
qkr3
;
float
sc10
=
qi1
*
qk1
+
qi2
*
qk2
+
qi3
*
qk3
float
sc10
=
qi1
*
qk1
+
qi2
*
qk2
+
qi3
*
qk3
+
qi
4
*
qk
4
+
qi5
*
qk5
+
qi6
*
qk6
+
qi
2
*
qk
2
+
qi5
*
qk5
+
qi6
*
qk6
+
qi
7
*
qk
7
+
qi
8
*
qk
8
+
qi9
*
qk9
;
+
qi
3
*
qk
3
+
qi
6
*
qk
6
+
qi9
*
qk9
;
// calculate the scalar products for induced components
// calculate the scalar products for induced components
...
@@ -1031,12 +1859,9 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
...
@@ -1031,12 +1859,9 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
sA
->
labFrameQuadrupole
[
0
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
];
sA
->
labFrameQuadrupole
[
0
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
];
sA
->
labFrameQuadrupole
[
1
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
1
];
sA
->
labFrameQuadrupole
[
1
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
1
];
sA
->
labFrameQuadrupole
[
2
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
2
];
sA
->
labFrameQuadrupole
[
2
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
2
];
sA
->
labFrameQuadrupole
[
3
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
3
];
sA
->
labFrameQuadrupole
[
3
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
4
];
sA
->
labFrameQuadrupole
[
4
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
4
];
sA
->
labFrameQuadrupole
[
4
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
5
];
sA
->
labFrameQuadrupole
[
5
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
5
];
sA
->
labFrameQuadrupole
[
5
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
8
];
sA
->
labFrameQuadrupole
[
6
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
6
];
sA
->
labFrameQuadrupole
[
7
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
7
];
sA
->
labFrameQuadrupole
[
8
]
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomI
*
9
+
8
];
// induced dipole
// induced dipole
...
@@ -1120,28 +1945,35 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
...
@@ -1120,28 +1945,35 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block
// on first pass, set threads/block
static
unsigned
int
threadsPerBlock
=
0
;
static
unsigned
int
threadsPerBlock
=
0
;
static
const
int
maxL1
=
0
;
if
(
threadsPerBlock
==
0
){
if
(
threadsPerBlock
==
0
){
unsigned
int
sharedMemoryPerBlock
=
gpu
->
sharedMemoryPerBlock
;
unsigned
int
maxThreads
;
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
if
(
gpu
->
sm_version
>=
SM_20
)
{
maxThreads
=
384
;
maxThreads
=
384
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
if
(
maxL1
){
sharedMemoryPerBlock
=
16384
;
cudaFuncSetCacheConfig
(
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
,
cudaFuncCachePreferL1
);
}
}
else
if
(
gpu
->
sm_version
>=
SM_12
){
maxThreads
=
128
;
maxThreads
=
128
;
else
}
else
{
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)),
maxThreads
);
}
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
),
sharedMemoryPerBlock
),
maxThreads
);
}
}
kClearFields_3
(
amoebaGpu
,
1
);
kClearFields_3
(
amoebaGpu
,
1
);
#ifdef AMOEBA_DEBUG
//
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces:
numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
gpu->nonbond_threads_per_block
=%
u
\n
"
,
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u
maxL1
=%
d
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
),
(
sizeof
(
PmeDirectElectrostaticParticle
))
*
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
),
(
sizeof
(
PmeDirectElectrostaticParticle
))
*
threadsPerBlock
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
nonbond_threads_per_block
);
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
maxL1
);
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
}
#endif
//
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
if
(
gpu
->
bOutputBufferPerWarp
){
...
@@ -1156,12 +1988,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
...
@@ -1156,12 +1988,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
else
{
}
else
{
/*
if (gpu->sm_version >= SM_20)
cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferL1 );
//cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferShared );
*/
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractingWorkUnit
,
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
@@ -1187,7 +2013,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
...
@@ -1187,7 +2013,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
void
cudaComputeAmoebaPmeElectrostatic
(
amoebaGpuContext
amoebaGpu
)
void
cudaComputeAmoebaPmeElectrostatic
(
amoebaGpuContext
amoebaGpu
)
{
{
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpu
);
cudaComputeAmoebaPmeDirectElectrostatic
(
amoebaGpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
kCalculateAmoebaPMEInducedDipoleForces
(
amoebaGpu
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
cudaComputeAmoebaMapTorqueAndAddToForce
(
amoebaGpu
,
amoebaGpu
->
psTorque
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
View file @
8a331fb9
...
@@ -239,6 +239,9 @@ if( atomI == targetAtom || atomJ == targetAtom ){
...
@@ -239,6 +239,9 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// No interactions in this block.
// No interactions in this block.
}
else
{
}
else
{
#ifdef CALCULATE_FULL_TILE
flags
=
0xFFFFFFFF
;
#endif
sA
[
threadIdx
.
x
].
force
[
0
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
0
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
1
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
1
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
2
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
force
[
2
]
=
0
.
0
f
;
...
@@ -311,7 +314,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
...
@@ -311,7 +314,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA
[
jIdx
].
torque
[
0
]
+=
forceTorqueEnergy
[
2
].
x
;
psA
[
jIdx
].
torque
[
0
]
+=
forceTorqueEnergy
[
2
].
x
;
psA
[
jIdx
].
torque
[
1
]
+=
forceTorqueEnergy
[
2
].
y
;
psA
[
jIdx
].
torque
[
1
]
+=
forceTorqueEnergy
[
2
].
y
;
psA
[
jIdx
].
torque
[
2
]
+=
forceTorqueEnergy
[
2
].
z
;
psA
[
jIdx
].
torque
[
2
]
+=
forceTorqueEnergy
[
2
].
z
;
#ifndef CALCULATE_FULL_TILE
}
else
{
}
else
{
sA
[
threadIdx
.
x
].
tempForce
[
0
]
=
forceTorqueEnergy
[
0
].
x
;
sA
[
threadIdx
.
x
].
tempForce
[
0
]
=
forceTorqueEnergy
[
0
].
x
;
...
@@ -345,6 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
...
@@ -345,6 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA
[
jIdx
].
torque
[
1
]
+=
sA
[
threadIdx
.
x
].
tempTorque
[
1
]
+
sA
[
threadIdx
.
x
+
16
].
tempTorque
[
1
];
psA
[
jIdx
].
torque
[
1
]
+=
sA
[
threadIdx
.
x
].
tempTorque
[
1
]
+
sA
[
threadIdx
.
x
+
16
].
tempTorque
[
1
];
psA
[
jIdx
].
torque
[
2
]
+=
sA
[
threadIdx
.
x
].
tempTorque
[
2
]
+
sA
[
threadIdx
.
x
+
16
].
tempTorque
[
2
];
psA
[
jIdx
].
torque
[
2
]
+=
sA
[
threadIdx
.
x
].
tempTorque
[
2
]
+
sA
[
threadIdx
.
x
+
16
].
tempTorque
[
2
];
}
}
#endif
}
}
}
// end of atoms out-of-bounds
}
// end of atoms out-of-bounds
}
// end of flags&(1<<j block
}
// end of flags&(1<<j block
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
View file @
8a331fb9
...
@@ -437,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
...
@@ -437,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
maxThreads
=
192
;
maxThreads
=
192
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
if
(
gpu
->
bOutputBufferPerWarp
){
if
(
gpu
->
bOutputBufferPerWarp
){
...
@@ -469,7 +469,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
...
@@ -469,7 +469,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
if
(
amoebaGpu
->
log
){
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u
\n
"
,
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)
,
gpu
->
sharedMemoryPerBlock
),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
);
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d
\n
"
,
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d
\n
"
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond_blocks
,
threadsPerBlock
,
gpu
->
bOutputBufferPerWarp
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
View file @
8a331fb9
...
@@ -37,10 +37,9 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
...
@@ -37,10 +37,9 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
#undef AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
//
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
#ifdef INCLUDE_MI_FIELD_BUFFERS
__device__
void
sumTempBuffer
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
){
__device__
void
sumTempBuffer
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
){
atomI
.
tempBuffer
[
0
]
+=
atomJ
.
tempBuffer
[
0
];
atomI
.
tempBuffer
[
0
]
+=
atomJ
.
tempBuffer
[
0
];
...
@@ -51,6 +50,93 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
...
@@ -51,6 +50,93 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
atomI
.
tempBufferP
[
1
]
+=
atomJ
.
tempBufferP
[
1
];
atomI
.
tempBufferP
[
1
]
+=
atomJ
.
tempBufferP
[
1
];
atomI
.
tempBufferP
[
2
]
+=
atomJ
.
tempBufferP
[
2
];
atomI
.
tempBufferP
[
2
]
+=
atomJ
.
tempBufferP
[
2
];
}
}
#endif
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
/**
 * Compute the pair-dependent geometry and scalar factors needed to evaluate the
 * direct-space (Ewald real-space) mutual induced field between two particles.
 *
 * @param atomI       first particle; its x, y, z, damp and thole fields are read
 * @param atomJ       second particle; its x, y, z, damp and thole fields are read
 * @param uscale      scale factor multiplied into the damping scale terms
 * @param delta       output: x/y/z receive the separation (atomJ - atomI) after the
 *                    periodic-box wrap; w receives (rr3 - bn1), or 0 beyond the cutoff
 * @param preFactor2  output: receives (bn2 - rr5), or 0 beyond the cutoff
 */
__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
                                                       const float uscale, float4* delta, float* preFactor2 )
{

    // compute the real space portion of the Ewald summation

    delta->x = atomJ.x - atomI.x;
    delta->y = atomJ.y - atomI.y;
    delta->z = atomJ.z - atomI.z;

    // periodic boundary conditions

    delta->x -= floor( delta->x*cSim.invPeriodicBoxSizeX + 0.5f )*cSim.periodicBoxSizeX;
    delta->y -= floor( delta->y*cSim.invPeriodicBoxSizeY + 0.5f )*cSim.periodicBoxSizeY;
    delta->z -= floor( delta->z*cSim.invPeriodicBoxSizeZ + 0.5f )*cSim.periodicBoxSizeZ;

    const float distanceSqr = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z);

    // beyond the cutoff the pair contributes nothing; the test is written as the
    // negation of '<=' so that a NaN distance also takes this path
    if( !(distanceSqr <= cSim.nonbondedCutoffSqr) ){
        *preFactor2 = 0.0f;
        delta->w    = 0.0f;
        return;
    }

    const float distance = sqrtf( distanceSqr );

    // calculate the error function damping terms

    const float alphaR       = cSim.alphaEwald*distance;
    const float bn0          = erfc( alphaR )/distance;
    const float twoAlphaSqr  = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
    const float gaussian     = exp( -(alphaR*alphaR) );

    float alphaPower         = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);

    alphaPower              *= twoAlphaSqr;
    const float bn1          = (bn0 + alphaPower*gaussian)/distanceSqr;

    alphaPower              *= twoAlphaSqr;
    const float bn2          = (3.0f*bn1 + alphaPower*gaussian)/distanceSqr;

    // compute the error function scaled and unscaled terms

    float scale3             = 1.0f;
    float scale5             = 1.0f;
    float damp               = atomI.damp*atomJ.damp;
    if( damp != 0.0f ){

        float ratio          = (distance/damp);
        ratio                = ratio*ratio*ratio;

        // use the smaller of the two thole values
        float pgamma;
        if( atomI.thole < atomJ.thole ){
            pgamma           = atomI.thole;
        } else {
            pgamma           = atomJ.thole;
        }
        damp                 = -pgamma*ratio;

        // for damp <= -50 the scale factors are left at 1
        if( damp > -50.0f ){
            const float expDamp = exp( damp );
            scale3              = 1.0f - expDamp;
            scale5              = 1.0f - expDamp*(1.0f - damp);
        }
    }

    const float dsc3         = uscale*scale3;
    const float dsc5         = uscale*scale5;

    const float distance3    = (distance*distanceSqr);
    const float distance5    = (distance3*distanceSqr);
    const float rr3          = (1.0f - dsc3)/distance3;
    const float rr5          = 3.0f*(1.0f - dsc5)/distance5;

    delta->w                 = rr3 - bn1;
    *preFactor2              = bn2 - rr5;
}
/**
 * Accumulate into fieldSum the field contribution of one induced dipole, using the
 * pair factors held in delta (x, y, z, w) and preFactor2.
 *
 * @param inducedDipole  3-component induced dipole of the source particle
 * @param delta          pair separation in x/y/z and the scalar dipole multiplier in w
 * @param preFactor2     scalar multiplier applied to (inducedDipole . delta)
 * @param fieldSum       3-component accumulator; each component is incremented
 */
__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta,
                                                           const float preFactor2, float fieldSum[3] )
{
    // projection of the dipole on the separation vector, scaled by preFactor2
    const float scaledProjection = preFactor2*( inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z );

    fieldSum[0] += scaledProjection*delta.x + delta.w*inducedDipole[0];
    fieldSum[1] += scaledProjection*delta.y + delta.w*inducedDipole[1];
    fieldSum[2] += scaledProjection*delta.z + delta.w*inducedDipole[2];
}
/**
 * Store in fieldSum the field contribution of one induced dipole, using the pair
 * factors held in delta (x, y, z, w) and preFactor2. Unlike
 * calculateMutualInducedFieldPairIxn_kernel, the previous contents of fieldSum are
 * overwritten rather than incremented.
 *
 * @param inducedDipole  3-component induced dipole of the source particle
 * @param delta          pair separation in x/y/z and the scalar dipole multiplier in w
 * @param preFactor2     scalar multiplier applied to (inducedDipole . delta)
 * @param fieldSum       3-component output; each component is assigned
 */
__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta,
                                                                const float preFactor2, float fieldSum[3] )
{
    // projection of the dipole on the separation vector, scaled by preFactor2
    const float scaledProjection = preFactor2*( inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z );

    fieldSum[0] = scaledProjection*delta.x + delta.w*inducedDipole[0];
    fieldSum[1] = scaledProjection*delta.y + delta.w*inducedDipole[1];
    fieldSum[2] = scaledProjection*delta.z + delta.w*inducedDipole[2];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
...
@@ -385,7 +471,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
...
@@ -385,7 +471,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
maxThreads
=
128
;
maxThreads
=
128
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
@@ -573,17 +659,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
...
@@ -573,17 +659,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu
->
psWorkVector
[
0
]
->
_pDevData
,
amoebaGpu
->
psWorkVector
[
1
]
->
_pDevData
);
amoebaGpu
->
psWorkVector
[
0
]
->
_pDevData
,
amoebaGpu
->
psWorkVector
[
1
]
->
_pDevData
);
LAUNCHERROR
(
"kSorUpdatePmeMutualInducedField"
);
LAUNCHERROR
(
"kSorUpdatePmeMutualInducedField"
);
if
(
0
){
if
(
0
){
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
std
::
vector
<
int
>
fileId
;
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
iteration
);
fileId
.
push_back
(
iteration
);
VectorOfDoubleVectors
outputVector
;
VectorOfDoubleVectors
outputVector
;
//
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
//
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
,
gpu
->
psAtomIndex
->
_pSysData
,
1.0
f
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeDirectMI"
,
fileId
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeDirectMI"
,
fileId
,
outputVector
);
}
}
// get total epsilon -- performing sums on gpu
// get total epsilon -- performing sums on gpu
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
View file @
8a331fb9
...
@@ -100,99 +100,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
...
@@ -100,99 +100,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
{
float4
ijField
[
3
];
// load coords, charge, ...
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
uscale
,
ijField
float4
delta
;
#ifdef AMOEBA_DEBUG
float
prefactor2
;
,
pullBack
if
(
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
j
)
>=
cSim
.
atoms
)
)
){
#endif
delta
.
w
=
prefactor2
=
0
.
0
f
;
);
}
else
{
setupMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
j
],
uscale
,
&
delta
,
&
prefactor2
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
j
)
>=
cSim
.
atoms
)
)
?
0
:
1
;
}
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
j
].
inducedDipole
,
delta
,
prefactor2
,
fieldSum
);
// add to field at atomI the field due atomJ's dipole
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
j
].
inducedDipolePolar
,
delta
,
prefactor2
,
fieldPolarSum
);
fieldSum
[
0
]
+=
mask
?
ijField
[
0
].
x
:
0
.
0
f
;
fieldSum
[
1
]
+=
mask
?
ijField
[
1
].
x
:
0
.
0
f
;
fieldSum
[
2
]
+=
mask
?
ijField
[
2
].
x
:
0
.
0
f
;
fieldPolarSum
[
0
]
+=
mask
?
ijField
[
0
].
z
:
0
.
0
f
;
fieldPolarSum
[
1
]
+=
mask
?
ijField
[
1
].
z
:
0
.
0
f
;
fieldPolarSum
[
2
]
+=
mask
?
ijField
[
2
].
z
:
0
.
0
f
;
#ifdef AMOEBA_DEBUG
/*
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
}
*/
#endif
}
}
...
@@ -226,6 +144,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
...
@@ -226,6 +144,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// No interactions in this block.
// No interactions in this block.
}
else
{
}
else
{
#ifndef INCLUDE_MI_FIELD_BUFFERS
flags
=
0xFFFFFFFF
;
#endif
// zero shared fields
// zero shared fields
zeroMutualInducedParticleSharedField
(
&
(
sA
[
threadIdx
.
x
])
);
zeroMutualInducedParticleSharedField
(
&
(
sA
[
threadIdx
.
x
])
);
...
@@ -235,53 +157,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
...
@@ -235,53 +157,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if
((
flags
&
(
1
<<
j
))
!=
0
)
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
{
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
float4
ijField
[
3
];
float4
delta
;
float
prefactor2
;
// load coords, charge, ...
if
(
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
jIdx
)
>=
cSim
.
atoms
)
){
delta
.
w
=
prefactor2
=
0
.
0
f
;
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
jIdx
],
uscale
,
ijField
}
else
{
#ifdef AMOEBA_DEBUG
setupMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
jIdx
],
uscale
,
&
delta
,
&
prefactor2
);
,
pullBack
}
#endif
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
jIdx
].
inducedDipole
,
delta
,
prefactor2
,
fieldSum
);
);
calculateMutualInducedFieldPairIxn_kernel
(
psA
[
jIdx
].
inducedDipolePolar
,
delta
,
prefactor2
,
fieldPolarSum
);
#ifndef INCLUDE_MI_FIELD_BUFFERS
unsigned
int
mask
=
(
(
atomI
>=
cSim
.
atoms
)
||
((
y
+
jIdx
)
>=
cSim
.
atoms
)
)
?
0
:
1
;
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipole
,
delta
,
prefactor2
,
psA
[
jIdx
].
field
);
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipolePolar
,
delta
,
prefactor2
,
psA
[
jIdx
].
fieldPolar
);
// add to field at atomI the field due atomJ's dipole
#else
fieldSum
[
0
]
+=
mask
?
ijField
[
0
].
x
:
0
.
0
f
;
fieldSum
[
1
]
+=
mask
?
ijField
[
1
].
x
:
0
.
0
f
;
fieldSum
[
2
]
+=
mask
?
ijField
[
2
].
x
:
0
.
0
f
;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum
[
0
]
+=
mask
?
ijField
[
0
].
z
:
0
.
0
f
;
fieldPolarSum
[
1
]
+=
mask
?
ijField
[
1
].
z
:
0
.
0
f
;
fieldPolarSum
[
2
]
+=
mask
?
ijField
[
2
].
z
:
0
.
0
f
;
// add to field at atomJ the field due atomI's dipole
if
(
flags
==
0xFFFFFFFF
){
if
(
flags
==
0xFFFFFFFF
){
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipole
,
delta
,
prefactor2
,
psA
[
jIdx
].
field
);
psA
[
jIdx
].
field
[
0
]
+=
mask
?
ijField
[
0
].
y
:
0
.
0
f
;
calculateMutualInducedFieldPairIxn_kernel
(
localParticle
.
inducedDipolePolar
,
delta
,
prefactor2
,
psA
[
jIdx
].
fieldPolar
);
psA
[
jIdx
].
field
[
1
]
+=
mask
?
ijField
[
1
].
y
:
0
.
0
f
;
psA
[
jIdx
].
field
[
2
]
+=
mask
?
ijField
[
2
].
y
:
0
.
0
f
;
// add to polar field at atomJ the field due atomI's dipole
psA
[
jIdx
].
fieldPolar
[
0
]
+=
mask
?
ijField
[
0
].
w
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
1
]
+=
mask
?
ijField
[
1
].
w
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
2
]
+=
mask
?
ijField
[
2
].
w
:
0
.
0
f
;
}
else
{
}
else
{
calculateMutualInducedFieldPairIxnNoAdd_kernel
(
localParticle
.
inducedDipole
,
delta
,
prefactor2
,
sA
[
threadIdx
.
x
].
tempBuffer
);
sA
[
threadIdx
.
x
].
tempBuffer
[
0
]
=
mask
?
ijField
[
0
].
y
:
0
.
0
;
calculateMutualInducedFieldPairIxnNoAdd_kernel
(
localParticle
.
inducedDipolePolar
,
delta
,
prefactor2
,
sA
[
threadIdx
.
x
].
tempBufferP
);
sA
[
threadIdx
.
x
].
tempBuffer
[
1
]
=
mask
?
ijField
[
1
].
y
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBuffer
[
2
]
=
mask
?
ijField
[
2
].
y
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBufferP
[
0
]
=
mask
?
ijField
[
0
].
w
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBufferP
[
1
]
=
mask
?
ijField
[
1
].
w
:
0
.
0
;
sA
[
threadIdx
.
x
].
tempBufferP
[
2
]
=
mask
?
ijField
[
2
].
w
:
0
.
0
;
if
(
tgx
%
2
==
0
){
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
sA
[
threadIdx
.
x
],
sA
[
threadIdx
.
x
+
1
]
);
sumTempBuffer
(
sA
[
threadIdx
.
x
],
sA
[
threadIdx
.
x
+
1
]
);
...
@@ -308,61 +202,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
...
@@ -308,61 +202,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
}
}
}
}
/*
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
}
#endif
#endif
*/
}
}
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
View file @
8a331fb9
...
@@ -531,7 +531,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
...
@@ -531,7 +531,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
maxThreads
=
192
;
maxThreads
=
192
;
else
else
maxThreads
=
128
;
maxThreads
=
128
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
Vdw14_7Particle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
Vdw14_7Particle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
if
(
0
){
if
(
0
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
View file @
8a331fb9
...
@@ -382,7 +382,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
...
@@ -382,7 +382,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
maxThreads
=
192
;
maxThreads
=
192
;
else
else
maxThreads
=
64
;
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
WcaDispersionParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
WcaDispersionParticle
)
,
gpu
->
sharedMemoryPerBlock
),
maxThreads
);
}
}
#ifdef AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment