Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
0f86d9d9
"...ssh:/git@developer.sourcefind.cn:2222/tsoc/openmm.git" did not exist on "d314e695a83252ff6cbfe07a4b25f446caae33a6"
Commit
0f86d9d9
authored
Aug 24, 2015
by
peastman
Browse files
Fixed performance regression on Kepler
parent
b591d011
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
8 deletions
+20
-8
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+20
-8
No files found.
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
0f86d9d9
...
@@ -211,10 +211,13 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
...
@@ -211,10 +211,13 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
#if __CUDA_ARCH__ < 500
// reduces the load on L2 cache.
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
// We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
#endif
real4
theta1
[
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
@@ -299,10 +302,13 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
...
@@ -299,10 +302,13 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
#if __CUDA_ARCH__ < 500
// reduces the load on L2 cache.
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
// We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
#endif
real4
theta1
[
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
@@ -446,10 +452,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -446,10 +452,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
#if __CUDA_ARCH__ < 500
// reduces the load on L2 cache.
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
// We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
#endif
real4
theta1
[
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
@@ -620,10 +629,13 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -620,10 +629,13 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
#if __CUDA_ARCH__ < 500
// reduces the load on L2 cache.
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
// We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
#endif
real4
theta1
[
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment