Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
6bad9d44
Commit
6bad9d44
authored
Apr 08, 2011
by
Mark Friedrichs
Browse files
Removal of several arrays no longer needed
parent
1beac75d
Changes
7
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
273 additions
and
539 deletions
+273
-539
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+153
-140
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+3
-14
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
...c/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+51
-105
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
...uda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+31
-38
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
.../src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+35
-44
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
.../platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
+0
-86
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
...a/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
+0
-112
No files found.
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
View file @
6bad9d44
This diff is collapsed.
Click to expand it.
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
View file @
6bad9d44
...
...
@@ -30,20 +30,10 @@
#include "kernels/gputypes.h"
#include "amoebaCudaTypes.h"
#define THREADS_PER_BLOCK 256
#include <map>
typedef
std
::
map
<
int
,
float
>
MapIntFloat
;
typedef
MapIntFloat
::
const_iterator
MapIntFloatCI
;
/*
* Remove
* pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
* allocation of torqueMapForce psCovalentDegree psPolarizationDegree
*
THREADS_PER_BLOCK
*/
struct
_amoebaGpuContext
{
_gpuContext
*
gpuContext
;
...
...
@@ -112,7 +102,6 @@ struct _amoebaGpuContext {
// multipole parameters
CUDAStream
<
int4
>*
psMultipoleParticlesIdsAndAxisType
;
CUDAStream
<
int
>*
psMultipoleAxisOffset
;
// buffer indices used for mapping torques onto forces
...
...
@@ -133,10 +122,10 @@ struct _amoebaGpuContext {
CUDAStream
<
float2
>*
psDampingFactorAndThole
;
//
slated for removal -- no longer used
//
used to setup scaling constants
CUDAStream
<
int
>
*
psC
ovalentDegree
;
CUDAStream
<
int
>
*
p
sP
olarizationDegree
;
std
::
vector
<
int
>
c
ovalentDegree
;
std
::
vector
<
int
>
polarizationDegree
;
// fixed-E field
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
View file @
6bad9d44
...
...
@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel(
float
*
inducedDipolePolarS
)
{
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
cSim
.
atoms
)
return
;
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
fixedEField
[
pos
]
*=
polarizability
[
pos
];
inducedDipole
[
pos
]
=
fixedEField
[
pos
];
fixedEField
[
threadId
]
*=
polarizability
[
threadId
];
inducedDipole
[
threadId
]
=
fixedEField
[
threadId
];
fixedEField
Polar
[
pos
]
*=
polarizability
[
pos
];
inducedDipole
Polar
[
pos
]
=
fixedEField
Polar
[
pos
];
fixedEFieldPolar
[
threadId
]
*=
polarizability
[
threadId
];
inducedDipolePolar
[
threadId
]
=
fixedEFieldPolar
[
threadId
];
fixedGkField
[
pos
]
*=
polarizability
[
pos
];
inducedDipoleS
[
pos
]
=
fixedEField
[
pos
]
+
fixedGkField
[
pos
];
inducedDipolePolarS
[
pos
]
=
fixedEFieldPolar
[
pos
]
+
fixedGkField
[
pos
];
fixedGkField
[
threadId
]
*=
polarizability
[
threadId
];
inducedDipoleS
[
threadId
]
=
fixedEField
[
threadId
]
+
fixedGkField
[
threadId
];
inducedDipolePolarS
[
threadId
]
=
fixedEFieldPolar
[
threadId
]
+
fixedGkField
[
threadId
];
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
...
...
@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel(
{
float
polarSOR
=
0.70
f
;
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
cSim
.
atoms
)
return
;
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
float
previousDipole
=
inducedDipole
[
threadId
];
float
previousDipoleP
=
inducedDipoleP
[
threadId
];
float
previousDipole
=
inducedDipole
[
pos
];
float
previousDipoleP
=
inducedDipoleP
[
pos
];
inducedDipole
[
threadId
]
=
fixedEField
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProduct
[
threadId
];
inducedDipoleP
[
threadId
]
=
fixedEFieldP
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProductP
[
threadId
];
inducedDipole
[
pos
]
=
fixedEField
[
pos
]
+
polarizability
[
pos
]
*
matrixProduct
[
pos
];
inducedDipoleP
[
pos
]
=
fixedEFieldP
[
pos
]
+
polarizability
[
pos
]
*
matrixProductP
[
pos
];
inducedDipole
[
threadId
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
threadId
]
-
previousDipole
);
inducedDipoleP
[
threadId
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
inducedDipole
[
pos
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
pos
]
-
previousDipole
);
inducedDipoleP
[
pos
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
matrixProduct
[
threadId
]
=
(
inducedDipole
[
threadId
]
-
previousDipole
)
*
(
inducedDipole
[
threadId
]
-
previousDipole
);
matrixProductP
[
threadId
]
=
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
matrixProduct
[
pos
]
=
(
inducedDipole
[
pos
]
-
previousDipole
)
*
(
inducedDipole
[
pos
]
-
previousDipole
);
matrixProductP
[
pos
]
=
(
inducedDipoleP
[
pos
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
__global__
...
...
@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
{
float
polarSOR
=
0.70
f
;
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
cSim
.
atoms
)
return
;
float
previousDipole
=
inducedDipole
[
threadId
];
float
previousDipoleP
=
inducedDipoleP
[
threadId
];
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
float
previousDipole
=
inducedDipole
[
pos
];
float
previousDipoleP
=
inducedDipoleP
[
pos
];
inducedDipole
[
threadId
]
=
fixedGkField
[
threadId
]
+
fixedEField
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProduct
[
threadId
];
inducedDipoleP
[
threadId
]
=
fixedGkField
[
threadId
]
+
fixedEFieldP
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProductP
[
threadId
];
inducedDipole
[
pos
]
=
fixedGkField
[
pos
]
+
fixedEField
[
pos
]
+
polarizability
[
pos
]
*
matrixProduct
[
pos
];
inducedDipoleP
[
pos
]
=
fixedGkField
[
pos
]
+
fixedEFieldP
[
pos
]
+
polarizability
[
pos
]
*
matrixProductP
[
pos
];
inducedDipole
[
threadId
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
threadId
]
-
previousDipole
);
inducedDipoleP
[
threadId
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
inducedDipole
[
pos
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
pos
]
-
previousDipole
);
inducedDipoleP
[
pos
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
matrixProduct
[
threadId
]
=
(
inducedDipole
[
threadId
]
-
previousDipole
)
*
(
inducedDipole
[
threadId
]
-
previousDipole
);
matrixProductP
[
threadId
]
=
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
matrixProduct
[
pos
]
=
(
inducedDipole
[
pos
]
-
previousDipole
)
*
(
inducedDipole
[
pos
]
-
previousDipole
);
matrixProductP
[
pos
]
=
(
inducedDipoleP
[
pos
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
// reduce psWorkArray_3_1 -> outputArray
...
...
@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
LAUNCHERROR
(
"kReduceMutualInducedAndGkFields4"
);
}
#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(gpu->sim.paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
}
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
#endif
/**---------------------------------------------------------------------------------------
Compute mutual induce field
...
...
@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu
->
psWorkArray_3_3
->
Download
();
amoebaGpu
->
psWorkArray_3_4
->
Download
();
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printMiFieldAtomBuffers( amoebaGpu, 100 );
//printMiFieldBuffer( amoebaGpu, 0 );
//printMiFieldBuffer( amoebaGpu, 1 );
//printMiFieldBuffer( amoebaGpu, 37 );
//printMiFieldBuffer( amoebaGpu, 38 );
if
(
amoebaGpu
->
log
&&
iteration
==
1
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished MI kernel execution %d
\n
"
,
iteration
);
(
void
)
fflush
(
amoebaGpu
->
log
);
...
...
@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
int
iteration
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
int
numOfElems
=
gpu
->
natoms
*
3
;
int
numThreads
=
min
(
THREADS_PER_BLOCK
,
numOfElems
);
int
numBlocks
=
numOfElems
/
numThreads
;
if
(
(
numOfElems
%
numThreads
)
!=
0
)
numBlocks
++
;
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
&&
timestep
==
1
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e
\n
"
,
methodName
,
gpu
->
natoms
,
numOfElems
,
numThreads
,
numBlocks
,
amoebaGpu
->
mutualInducedMaxIterations
,
amoebaGpu
->
mutualInducedTargetEpsilon
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedAndGkField_kernel
<<<
numBlocks
,
numThreads
>>>
(
kInitializeMutualInducedAndGkField_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
amoebaGpu
->
psGk_Field
->
_pDevData
,
...
...
@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
// post matrix multiply
kSorUpdateMutualInducedAndGkField_kernel
<<<
numBlocks
,
numThreads
>>>
(
kSorUpdateMutualInducedAndGkField_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
amoebaGpu
->
psPolarizability
->
_pDevData
,
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
amoebaGpu
->
psWorkVector
[
0
]
->
_pDevData
,
amoebaGpu
->
psWorkVector
[
1
]
->
_pDevData
);
LAUNCHERROR
(
"cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1"
);
kSorUpdateMutualInducedAndGkFieldS_kernel
<<<
numBlocks
,
numThreads
>>>
(
kSorUpdateMutualInducedAndGkFieldS_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
amoebaGpu
->
psPolarizability
->
_pDevData
,
amoebaGpu
->
psInducedDipoleS
->
_pDevData
,
amoebaGpu
->
psInducedDipolePolarS
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
View file @
6bad9d44
...
...
@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel(
float
*
inducedDipolePolar
)
{
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
numberOfAtoms
)
return
;
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
fixedEField
[
pos
]
*=
polarizability
[
pos
];
inducedDipole
[
pos
]
=
fixedEField
[
pos
];
fixedEField
[
threadId
]
*=
polarizability
[
threadId
];
inducedDipole
[
threadId
]
=
fixedEField
[
threadId
];
fixedEField
Polar
[
pos
]
*=
polarizability
[
pos
];
inducedDipole
Polar
[
pos
]
=
fixedEField
Polar
[
pos
];
fixedEFieldPolar
[
threadId
]
*
=
polarizability
[
threadId
]
;
inducedDipolePolar
[
threadId
]
=
fixedEFieldPolar
[
threadId
];
pos
+
=
blockDim
.
x
*
gridDim
.
x
;
}
}
...
...
@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel(
{
float
polarSOR
=
0.70
f
;
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
numberOfEntries
)
return
;
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
float
previousDipole
=
inducedDipole
[
pos
];
float
previousDipoleP
=
inducedDipoleP
[
pos
];
float
previous
Dipole
=
inducedDipole
[
threadId
];
float
previous
DipoleP
=
inducedDipoleP
[
threadId
];
induced
Dipole
[
pos
]
=
fixedEField
[
pos
]
+
polarizability
[
pos
]
*
matrixProduct
[
pos
];
induced
DipoleP
[
pos
]
=
fixedEFieldP
[
pos
]
+
polarizability
[
pos
]
*
matrixProductP
[
pos
];
inducedDipole
[
threadId
]
=
fixedEField
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProduct
[
threadId
];
inducedDipoleP
[
threadId
]
=
fixedEFieldP
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProductP
[
threadId
]
;
inducedDipole
[
pos
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
pos
]
-
previousDipole
);
inducedDipoleP
[
pos
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
)
;
inducedDipole
[
threadId
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
threadId
]
-
previousDipole
);
inducedDipoleP
[
threadId
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
matrixProduct
[
pos
]
=
(
inducedDipole
[
pos
]
-
previousDipole
)
*
(
inducedDipole
[
pos
]
-
previousDipole
);
matrixProductP
[
pos
]
=
(
inducedDipoleP
[
pos
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
matrixProduct
[
threadId
]
=
(
inducedDipole
[
threadId
]
-
previousDipole
)
*
(
inducedDipole
[
threadId
]
-
previousDipole
)
;
matrixProductP
[
threadId
]
=
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
...
...
@@ -470,28 +478,13 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
int
iteration
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
int
numOfElems
=
gpu
->
natoms
*
3
;
int
numThreads
=
min
(
THREADS_PER_BLOCK
,
numOfElems
);
int
numBlocks
=
numOfElems
/
numThreads
;
if
(
(
numOfElems
%
numThreads
)
!=
0
)
numBlocks
++
;
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e
\n
"
,
methodName
,
gpu
->
natoms
,
numOfElems
,
numThreads
,
numBlocks
,
amoebaGpu
->
mutualInducedMaxIterations
,
amoebaGpu
->
mutualInducedTargetEpsilon
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel
<<<
numBlocks
,
numThreads
>>>
(
kInitializeMutualInducedField_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
natoms
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
...
...
@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// post matrix multiply
kSorUpdateMutualInducedField_kernel
<<<
numBlocks
,
numThreads
>>>
(
kSorUpdateMutualInducedField_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
natoms
,
amoebaGpu
->
psPolarizability
->
_pDevData
,
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
View file @
6bad9d44
...
...
@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel(
float
*
inducedDipolePolar
)
{
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
numberOfAtoms
)
return
;
fixedEField
[
threadId
]
*=
polarizability
[
threadId
];
inducedDipole
[
threadId
]
=
fixedEField
[
threadId
];
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
fixedEField
[
pos
]
*=
polarizability
[
pos
];
inducedDipole
[
pos
]
=
fixedEField
[
pos
];
fixedEFieldPolar
[
threadId
]
*=
polarizability
[
threadId
];
inducedDipolePolar
[
threadId
]
=
fixedEFieldPolar
[
threadId
];
fixedEFieldPolar
[
pos
]
*=
polarizability
[
pos
];
inducedDipolePolar
[
pos
]
=
fixedEFieldPolar
[
pos
];
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
...
...
@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel(
float
*
matrixProduct
,
float
*
matrixProductP
)
{
int
threadId
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
threadId
>=
3
*
numberOfEntries
)
return
;
int
pos
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
while
(
pos
<
3
*
cSim
.
atoms
)
{
float
previousDipole
=
inducedDipole
[
threadId
];
float
previousDipoleP
=
inducedDipoleP
[
threadId
];
float
previousDipole
=
inducedDipole
[
pos
];
float
previousDipoleP
=
inducedDipoleP
[
pos
];
// add self terms to fields
const
float
term
=
(
4.0
f
/
3.0
f
)
*
(
cSim
.
alphaEwald
*
cSim
.
alphaEwald
*
cSim
.
alphaEwald
)
/
cAmoebaSim
.
sqrtPi
;
matrixProduct
[
threadId
]
+=
term
*
previousDipole
;
matrixProductP
[
threadId
]
+=
term
*
previousDipoleP
;
matrixProduct
[
pos
]
+=
term
*
previousDipole
;
matrixProductP
[
pos
]
+=
term
*
previousDipoleP
;
inducedDipole
[
threadId
]
=
fixedEField
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProduct
[
threadId
];
inducedDipoleP
[
threadId
]
=
fixedEFieldP
[
threadId
]
+
polarizability
[
threadId
]
*
matrixProductP
[
threadId
];
inducedDipole
[
pos
]
=
fixedEField
[
pos
]
+
polarizability
[
pos
]
*
matrixProduct
[
pos
];
inducedDipoleP
[
pos
]
=
fixedEFieldP
[
pos
]
+
polarizability
[
pos
]
*
matrixProductP
[
pos
];
const
float
polarSOR
=
0.70
f
;
inducedDipole
[
threadId
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
threadId
]
-
previousDipole
);
inducedDipoleP
[
threadId
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
inducedDipole
[
pos
]
=
previousDipole
+
polarSOR
*
(
inducedDipole
[
pos
]
-
previousDipole
);
inducedDipoleP
[
pos
]
=
previousDipoleP
+
polarSOR
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
matrixProduct
[
pos
]
=
(
inducedDipole
[
pos
]
-
previousDipole
)
*
(
inducedDipole
[
pos
]
-
previousDipole
);
matrixProductP
[
pos
]
=
(
inducedDipoleP
[
pos
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
pos
]
-
previousDipoleP
);
matrixProduct
[
threadId
]
=
(
inducedDipole
[
threadId
]
-
previousDipole
)
*
(
inducedDipole
[
threadId
]
-
previousDipole
)
;
matrixProductP
[
threadId
]
=
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
)
*
(
inducedDipoleP
[
threadId
]
-
previousDipoleP
);
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
...
...
@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
int
iteration
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
int
numOfElems
=
gpu
->
natoms
*
3
;
int
numThreads
=
min
(
THREADS_PER_BLOCK
,
numOfElems
);
int
numBlocks
=
numOfElems
/
numThreads
;
if
(
(
numOfElems
%
numThreads
)
!=
0
)
numBlocks
++
;
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e
\n
"
,
methodName
,
gpu
->
natoms
,
numOfElems
,
numThreads
,
numBlocks
,
amoebaGpu
->
mutualInducedMaxIterations
,
amoebaGpu
->
mutualInducedTargetEpsilon
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel
<<<
numBlocks
,
numThreads
>>>
(
kInitializeMutualInducedField_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
natoms
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
...
...
@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// post matrix multiply
kSorUpdateMutualInducedField_kernel
<<<
numBlocks
,
numThreads
>>>
(
kSorUpdateMutualInducedField_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
(
gpu
->
natoms
,
amoebaGpu
->
psPolarizability
->
_pDevData
,
amoebaGpu
->
psInducedDipole
->
_pDevData
,
amoebaGpu
->
psInducedDipolePolar
->
_pDevData
,
amoebaGpu
->
psE_Field
->
_pDevData
,
amoebaGpu
->
psE_FieldPolar
->
_pDevData
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.cu
deleted
100644 → 0
View file @
1beac75d
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include <stdio.h>
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#define AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
void
cudaReduceN2ToN
(
float
*
N2Array
,
int
Nsz
,
float
*
NArray
,
int
includeDiagonal
,
int
offset
)
{
int
numThreads
=
min
(
THREADS_PER_BLOCK
,
(
Nsz
));
int
numBlocksPerAtom
=
(
Nsz
/
numThreads
);
if
(
Nsz
%
numThreads
){
numBlocksPerAtom
++
;
}
int
numBlocks
=
numBlocksPerAtom
*
Nsz
;
float
*
partialSum1_d
;
// allocate GPU memory
cudaMalloc
(
(
void
**
)
&
partialSum1_d
,
numBlocks
*
offset
*
sizeof
(
float
)
);
if
(
includeDiagonal
){
if
(
offset
==
3
){
kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNBlockLevel
<<<
numBlocks
,
numThreads
>>>
(
N2Array
,
partialSum1_d
,
Nsz
,
numBlocksPerAtom
);
LAUNCHERROR
(
"kCalculateAmoebaReduceN2ToNBlockLevel1"
);
}
else
if
(
offset
==
1
){
kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNBlockLevel
<<<
numBlocks
,
numThreads
>>>
(
N2Array
,
partialSum1_d
,
Nsz
,
numBlocksPerAtom
);
LAUNCHERROR
(
"kCalculateAmoebaReduceN2ToNBlockLevel2"
);
}
}
else
{
if
(
offset
==
3
){
kCalculateAmoebaReduceExcludeDiagonalOffset3N2ToNBlockLevel
<<<
numBlocks
,
numThreads
>>>
(
N2Array
,
partialSum1_d
,
Nsz
,
numBlocksPerAtom
);
LAUNCHERROR
(
"kCalculateAmoebaReduceN2ToNBlockLevel3"
);
}
else
if
(
offset
==
1
){
kCalculateAmoebaReduceExcludeDiagonalOffset1N2ToNBlockLevel
<<<
numBlocks
,
numThreads
>>>
(
N2Array
,
partialSum1_d
,
Nsz
,
numBlocksPerAtom
);
LAUNCHERROR
(
"kCalculateAmoebaReduceN2ToNBlockLevel4"
);
}
}
int
numBlocks2
=
numBlocks
;
numBlocks
=
numBlocks2
*
Nsz
/
numThreads
;
if
(
(
numBlocks2
*
Nsz
)
%
numThreads
){
numBlocks
++
;
}
if
(
offset
==
3
){
kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNFinal
<<<
numBlocks
,
numThreads
>>>
(
partialSum1_d
,
NArray
,
Nsz
,
numBlocksPerAtom
);
LAUNCHERROR
(
"kCalculateAmoebaReduceN2ToNFinal3"
);
}
else
if
(
offset
==
1
){
kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNFinal
<<<
numBlocks
,
numThreads
>>>
(
partialSum1_d
,
NArray
,
Nsz
,
numBlocksPerAtom
);
LAUNCHERROR
(
"kCalculateAmoebaReduceN2ToNFinal1"
);
}
//Free memory
cudaFree
(
partialSum1_d
);
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaReduce.h
deleted
100644 → 0
View file @
1beac75d
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
typedef
unsigned
int
uint
;
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaReduce
,
N2ToNBlockLevel
)(
float
*
N2Array
,
float
*
partialSum
,
int
num
,
int
numberOfBlocksPerAtom
)
{
uint
tid
=
threadIdx
.
x
;
__shared__
float
asx
[
THREADS_PER_BLOCK
];
asx
[
tid
]
=
0
.
0
f
;
#ifdef AMOEBA_OFFSET_3
__shared__
float
asy
[
THREADS_PER_BLOCK
];
__shared__
float
asz
[
THREADS_PER_BLOCK
];
asx
[
tid
]
=
0
.
0
f
;
asy
[
tid
]
=
asz
[
tid
]
=
0
.
0
f
;
int
offset
=
3
;
#else
int
offset
=
1
;
#endif
int
atomI
=
blockIdx
.
x
/
numberOfBlocksPerAtom
;
int
atomJ
=
(
blockIdx
.
x
%
numberOfBlocksPerAtom
)
*
blockDim
.
x
+
tid
;
#ifdef AMOEBA_INCLUDE_DIAGONAL
if
(
atomJ
<
num
&&
atomI
<
num
){
#else
if
(
atomJ
<
num
&&
atomJ
!=
atomI
){
#endif
int
index
=
offset
*
(
atomI
*
num
+
atomJ
);
asx
[
tid
]
=
N2Array
[
index
];
#ifdef AMOEBA_OFFSET_3
asy
[
tid
]
=
N2Array
[
index
+
1
];
asz
[
tid
]
=
N2Array
[
index
+
2
];
#endif
}
__syncthreads
();
//to make sure all the elements are loaded
for
(
uint
s
=
(
blockDim
.
x
)
/
2
;
s
!=
0
;
s
>>=
1
){
if
(
tid
<
s
){
asx
[
tid
]
+=
asx
[
tid
+
s
];
#ifdef AMOEBA_OFFSET_3
asy
[
tid
]
+=
asy
[
tid
+
s
];
asz
[
tid
]
+=
asz
[
tid
+
s
];
#endif
}
__syncthreads
();
}
if
(
tid
==
0
){
partialSum
[
blockIdx
.
x
*
offset
]
=
asx
[
0
];
#ifdef AMOEBA_OFFSET_3
partialSum
[
blockIdx
.
x
*
3
+
1
]
=
asy
[
0
];
partialSum
[
blockIdx
.
x
*
3
+
2
]
=
asz
[
0
];
#endif
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateAmoebaReduce
,
N2ToNFinal
)(
float
*
partialSum
,
float
*
final
,
int
num
,
int
numberOfBlocksPerAtom
)
{
uint
thread_id
=
__mul24
(
blockIdx
.
x
,
blockDim
.
x
)
+
threadIdx
.
x
;
if
(
thread_id
>
num
)
return
;
float3
sum
;
#ifdef AMOEBA_OFFSET_3
int
offset
=
3
;
sum
.
x
=
sum
.
y
=
sum
.
z
=
0
.
0
f
;
#else
int
offset
=
1
;
sum
.
x
=
0
.
0
f
;
#endif
int
index
=
thread_id
*
offset
*
numberOfBlocksPerAtom
;
for
(
int
i
=
0
;
i
<
numberOfBlocksPerAtom
;
i
++
){
sum
.
x
+=
partialSum
[
index
+
i
*
offset
];
#ifdef AMOEBA_OFFSET_3
sum
.
y
+=
partialSum
[
index
+
i
*
offset
+
1
];
sum
.
z
+=
partialSum
[
index
+
i
*
offset
+
2
];
#endif
}
final
[
thread_id
*
offset
]
=
sum
.
x
;
#ifdef AMOEBA_OFFSET_3
final
[
thread_id
*
3
+
1
]
=
sum
.
y
;
final
[
thread_id
*
3
+
2
]
=
sum
.
z
;
#endif
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment