Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
c7af17c8
Unverified
Commit
c7af17c8
authored
Mar 24, 2022
by
Peter Eastman
Committed by
GitHub
Mar 24, 2022
Browse files
Removed code for CUDA devices without shuffle (#3528)
parent
ae1b02dc
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
233 deletions
+36
-233
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+36
-102
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+0
-131
No files found.
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
c7af17c8
...
@@ -556,28 +556,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -556,28 +556,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
int
cudaVersion
;
cuDriverGetVersion
(
&
cudaVersion
);
bool
useShuffle
=
(
context
.
getComputeCapability
()
>=
3.0
&&
cudaVersion
>=
5050
);
// Part 1. Defines for on diagonal exclusion tiles
// Part 1. Defines for on diagonal exclusion tiles
stringstream
loadLocal1
;
if
(
useShuffle
)
{
// not needed if using shuffles as we can directly fetch from register
}
else
{
for
(
const
ParameterInfo
&
param
:
params
)
{
if
(
param
.
getNumComponents
()
==
1
)
loadLocal1
<<
"localData[LOCAL_ID]."
<<
param
.
getName
()
<<
" = "
<<
param
.
getName
()
<<
"1;
\n
"
;
else
{
for
(
int
j
=
0
;
j
<
param
.
getNumComponents
();
++
j
)
loadLocal1
<<
"localData[LOCAL_ID]."
<<
param
.
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = "
<<
param
.
getName
()
<<
"1."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
stringstream
broadcastWarpData
;
stringstream
broadcastWarpData
;
if
(
useShuffle
)
{
broadcastWarpData
<<
"posq2.x = real_shfl(shflPosq.x, j);
\n
"
;
broadcastWarpData
<<
"posq2.x = real_shfl(shflPosq.x, j);
\n
"
;
broadcastWarpData
<<
"posq2.y = real_shfl(shflPosq.y, j);
\n
"
;
broadcastWarpData
<<
"posq2.y = real_shfl(shflPosq.y, j);
\n
"
;
broadcastWarpData
<<
"posq2.z = real_shfl(shflPosq.z, j);
\n
"
;
broadcastWarpData
<<
"posq2.z = real_shfl(shflPosq.z, j);
\n
"
;
...
@@ -591,60 +572,22 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -591,60 +572,22 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
broadcastWarpData
<<
"shfl"
<<
param
.
getName
()
+
"."
+
suffixes
[
j
]
<<
"=real_shfl("
<<
param
.
getName
()
+
"1."
+
suffixes
[
j
]
<<
",j);
\n
"
;
broadcastWarpData
<<
"shfl"
<<
param
.
getName
()
+
"."
+
suffixes
[
j
]
<<
"=real_shfl("
<<
param
.
getName
()
+
"1."
+
suffixes
[
j
]
<<
",j);
\n
"
;
}
}
}
}
}
else
{
// not used if not shuffling
}
replacements
[
"BROADCAST_WARP_DATA"
]
=
broadcastWarpData
.
str
();
replacements
[
"BROADCAST_WARP_DATA"
]
=
broadcastWarpData
.
str
();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream
declareLocal2
;
stringstream
declareLocal2
;
if
(
useShuffle
)
{
for
(
const
ParameterInfo
&
param
:
params
)
for
(
const
ParameterInfo
&
param
:
params
)
declareLocal2
<<
param
.
getType
()
<<
" shfl"
<<
param
.
getName
()
<<
";
\n
"
;
declareLocal2
<<
param
.
getType
()
<<
" shfl"
<<
param
.
getName
()
<<
";
\n
"
;
}
else
{
// not used if using shared memory
}
replacements
[
"DECLARE_LOCAL_PARAMETERS"
]
=
declareLocal2
.
str
();
replacements
[
"DECLARE_LOCAL_PARAMETERS"
]
=
declareLocal2
.
str
();
stringstream
loadLocal2
;
stringstream
loadLocal2
;
if
(
useShuffle
)
{
for
(
const
ParameterInfo
&
param
:
params
)
for
(
const
ParameterInfo
&
param
:
params
)
loadLocal2
<<
"shfl"
<<
param
.
getName
()
<<
" = global_"
<<
param
.
getName
()
<<
"[j];
\n
"
;
loadLocal2
<<
"shfl"
<<
param
.
getName
()
<<
" = global_"
<<
param
.
getName
()
<<
"[j];
\n
"
;
}
else
{
for
(
const
ParameterInfo
&
param
:
params
)
{
if
(
param
.
getNumComponents
()
==
1
)
loadLocal2
<<
"localData[LOCAL_ID]."
<<
param
.
getName
()
<<
" = global_"
<<
param
.
getName
()
<<
"[j];
\n
"
;
else
{
loadLocal2
<<
param
.
getType
()
<<
" temp_"
<<
param
.
getName
()
<<
" = global_"
<<
param
.
getName
()
<<
"[j];
\n
"
;
for
(
int
j
=
0
;
j
<
param
.
getNumComponents
();
++
j
)
loadLocal2
<<
"localData[LOCAL_ID]."
<<
param
.
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = temp_"
<<
param
.
getName
()
<<
"."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
stringstream
load2j
;
stringstream
load2j
;
if
(
useShuffle
)
{
for
(
const
ParameterInfo
&
param
:
params
)
for
(
const
ParameterInfo
&
param
:
params
)
load2j
<<
param
.
getType
()
<<
" "
<<
param
.
getName
()
<<
"2 = shfl"
<<
param
.
getName
()
<<
";
\n
"
;
load2j
<<
param
.
getType
()
<<
" "
<<
param
.
getName
()
<<
"2 = shfl"
<<
param
.
getName
()
<<
";
\n
"
;
}
else
{
for
(
const
ParameterInfo
&
param
:
params
)
{
if
(
param
.
getNumComponents
()
==
1
)
load2j
<<
param
.
getType
()
<<
" "
<<
param
.
getName
()
<<
"2 = localData[atom2]."
<<
param
.
getName
()
<<
";
\n
"
;
else
{
load2j
<<
param
.
getType
()
<<
" "
<<
param
.
getName
()
<<
"2 = make_"
<<
param
.
getType
()
<<
"("
;
for
(
int
j
=
0
;
j
<
param
.
getNumComponents
();
++
j
)
{
if
(
j
>
0
)
load2j
<<
", "
;
load2j
<<
"localData[atom2]."
<<
param
.
getName
()
<<
"_"
<<
suffixes
[
j
];
}
load2j
<<
");
\n
"
;
}
}
}
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
stringstream
load2g
;
stringstream
load2g
;
...
@@ -654,11 +597,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -654,11 +597,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
stringstream
clearLocal
;
stringstream
clearLocal
;
for
(
const
ParameterInfo
&
param
:
params
)
{
for
(
const
ParameterInfo
&
param
:
params
)
{
if
(
useShuffle
)
clearLocal
<<
"shfl"
<<
param
.
getName
()
<<
" = "
;
clearLocal
<<
"shfl"
;
else
clearLocal
<<
"localData[atom2]."
;
clearLocal
<<
param
.
getName
()
<<
" = "
;
if
(
param
.
getNumComponents
()
==
1
)
if
(
param
.
getNumComponents
()
==
1
)
clearLocal
<<
"0;
\n
"
;
clearLocal
<<
"0;
\n
"
;
else
else
...
@@ -680,7 +619,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -680,7 +619,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
replacements
[
"SAVE_DERIVATIVES"
]
=
saveDerivs
.
str
();
replacements
[
"SAVE_DERIVATIVES"
]
=
saveDerivs
.
str
();
stringstream
shuffleWarpData
;
stringstream
shuffleWarpData
;
if
(
useShuffle
)
{
shuffleWarpData
<<
"shflPosq.x = real_shfl(shflPosq.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.x = real_shfl(shflPosq.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.y = real_shfl(shflPosq.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.y = real_shfl(shflPosq.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.z = real_shfl(shflPosq.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.z = real_shfl(shflPosq.z, tgx+1);
\n
"
;
...
@@ -701,9 +639,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -701,9 +639,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
}
}
}
}
}
else
{
// not used otherwise
}
replacements
[
"SHUFFLE_WARP_DATA"
]
=
shuffleWarpData
.
str
();
replacements
[
"SHUFFLE_WARP_DATA"
]
=
shuffleWarpData
.
str
();
map
<
string
,
string
>
defines
;
map
<
string
,
string
>
defines
;
...
@@ -715,7 +650,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -715,7 +650,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
isSymmetric
)
if
(
isSymmetric
)
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
if
(
useShuffle
)
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
if
(
includeForces
)
if
(
includeForces
)
defines
[
"INCLUDE_FORCES"
]
=
"1"
;
defines
[
"INCLUDE_FORCES"
]
=
"1"
;
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
c7af17c8
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#ifndef ENABLE_SHUFFLE
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
real
fx
,
fy
,
fz
;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real
padding
;
#endif
}
AtomData
;
#endif
#ifdef ENABLE_SHUFFLE
//support for 64 bit shuffles
//support for 64 bit shuffles
static
__inline__
__device__
float
real_shfl
(
float
var
,
int
srcLane
)
{
static
__inline__
__device__
float
real_shfl
(
float
var
,
int
srcLane
)
{
return
SHFL
(
var
,
srcLane
);
return
SHFL
(
var
,
srcLane
);
...
@@ -39,7 +26,6 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
...
@@ -39,7 +26,6 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
int2
fuse
;
fuse
.
x
=
lo
;
fuse
.
y
=
hi
;
int2
fuse
;
fuse
.
x
=
lo
;
fuse
.
y
=
hi
;
return
*
reinterpret_cast
<
long
long
*>
(
&
fuse
);
return
*
reinterpret_cast
<
long
long
*>
(
&
fuse
);
}
}
#endif
/**
/**
* Save the force on a single atom.
* Save the force on a single atom.
...
@@ -131,10 +117,6 @@ extern "C" __global__ void computeNonbonded(
...
@@ -131,10 +117,6 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
mixed
energy
=
0
;
mixed
energy
=
0
;
INIT_DERIVATIVES
INIT_DERIVATIVES
// used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
#endif
// First loop: process tiles that contain exclusions.
// First loop: process tiles that contain exclusions.
...
@@ -154,26 +136,14 @@ extern "C" __global__ void computeNonbonded(
...
@@ -154,26 +136,14 @@ extern "C" __global__ void computeNonbonded(
const
bool
hasExclusions
=
true
;
const
bool
hasExclusions
=
true
;
if
(
x
==
y
)
{
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
#ifdef ENABLE_SHUFFLE
real4
shflPosq
=
posq1
;
real4
shflPosq
=
posq1
;
#else
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
LOAD_LOCAL_PARAMETERS_FROM_1
#endif
// we do not need to fetch parameters from global since this is a symmetric tile
// we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle
// instead we can broadcast the values using shuffle
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
;
real4
posq2
;
#ifdef ENABLE_SHUFFLE
BROADCAST_WARP_DATA
BROADCAST_WARP_DATA
#else
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
...
@@ -216,20 +186,10 @@ extern "C" __global__ void computeNonbonded(
...
@@ -216,20 +186,10 @@ extern "C" __global__ void computeNonbonded(
// This is an off-diagonal tile.
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
shflPosq
=
posq
[
j
];
real4
shflPosq
=
posq
[
j
];
#ifdef ENABLE_SHUFFLE
real3
shflForce
;
real3
shflForce
;
shflForce
.
x
=
0.0
f
;
shflForce
.
x
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
#else
localData
[
threadIdx
.
x
].
x
=
shflPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
shflPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
shflPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
shflPosq
.
w
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
#endif
DECLARE_LOCAL_PARAMETERS
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
...
@@ -238,11 +198,7 @@ extern "C" __global__ void computeNonbonded(
...
@@ -238,11 +198,7 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
real4
posq2
=
shflPosq
;
#else
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
...
@@ -271,34 +227,19 @@ extern "C" __global__ void computeNonbonded(
...
@@ -271,34 +227,19 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
shflForce
.
z
+=
delta
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#endif
#else // !USE_SYMMETRIC
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
shflForce
.
z
+=
dEdR2
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#endif // end USE_SYMMETRIC
#endif
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#endif
#endif
...
@@ -309,15 +250,9 @@ extern "C" __global__ void computeNonbonded(
...
@@ -309,15 +250,9 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
// write results for off diagonal tiles
// write results for off diagonal tiles
#ifdef INCLUDE_FORCES
#ifdef INCLUDE_FORCES
#ifdef ENABLE_SHUFFLE
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
x
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
x
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
y
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
y
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
z
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
z
)));
#else
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
localData
[
threadIdx
.
x
].
fx
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
localData
[
threadIdx
.
x
].
fy
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
localData
[
threadIdx
.
x
].
fz
)));
#endif
#endif
#endif
}
}
// Write results for on and off diagonal tiles
// Write results for on and off diagonal tiles
...
@@ -399,37 +334,19 @@ extern "C" __global__ void computeNonbonded(
...
@@ -399,37 +334,19 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
atomIndices
[
threadIdx
.
x
]
=
j
;
#ifdef ENABLE_SHUFFLE
DECLARE_LOCAL_PARAMETERS
DECLARE_LOCAL_PARAMETERS
real4
shflPosq
;
real4
shflPosq
;
real3
shflForce
;
real3
shflForce
;
shflForce
.
x
=
0.0
f
;
shflForce
.
x
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
#endif
if
(
j
<
PADDED_NUM_ATOMS
)
{
if
(
j
<
PADDED_NUM_ATOMS
)
{
// Load position of atom j from from global memory
// Load position of atom j from from global memory
#ifdef ENABLE_SHUFFLE
shflPosq
=
posq
[
j
];
shflPosq
=
posq
[
j
];
#else
localData
[
threadIdx
.
x
].
x
=
posq
[
j
].
x
;
localData
[
threadIdx
.
x
].
y
=
posq
[
j
].
y
;
localData
[
threadIdx
.
x
].
z
=
posq
[
j
].
z
;
localData
[
threadIdx
.
x
].
q
=
posq
[
j
].
w
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
else
{
else
{
#ifdef ENABLE_SHUFFLE
shflPosq
=
make_real4
(
0
,
0
,
0
,
0
);
shflPosq
=
make_real4
(
0
,
0
,
0
,
0
);
#else
localData
[
threadIdx
.
x
].
x
=
0
;
localData
[
threadIdx
.
x
].
y
=
0
;
localData
[
threadIdx
.
x
].
z
=
0
;
#endif
CLEAR_LOCAL_PARAMETERS
CLEAR_LOCAL_PARAMETERS
}
}
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -438,19 +355,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -438,19 +355,11 @@ extern "C" __global__ void computeNonbonded(
// box, then skip having to apply periodic boundary conditions later.
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
#ifdef ENABLE_SHUFFLE
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
shflPosq
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
shflPosq
,
blockCenterX
)
#else
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
],
blockCenterX
)
#endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
real4
posq2
=
shflPosq
;
#else
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
...
@@ -476,34 +385,19 @@ extern "C" __global__ void computeNonbonded(
...
@@ -476,34 +385,19 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
shflForce
.
z
+=
delta
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#endif
#else // !USE_SYMMETRIC
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
shflForce
.
z
+=
dEdR2
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#endif // end USE_SYMMETRIC
#endif
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
...
@@ -514,11 +408,7 @@ extern "C" __global__ void computeNonbonded(
...
@@ -514,11 +408,7 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
real4
posq2
=
shflPosq
;
#else
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
...
@@ -547,34 +437,19 @@ extern "C" __global__ void computeNonbonded(
...
@@ -547,34 +437,19 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
shflForce
.
z
+=
delta
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#endif
#else // !USE_SYMMETRIC
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
shflForce
.
z
+=
dEdR2
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#endif // end USE_SYMMETRIC
#endif
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
...
@@ -590,15 +465,9 @@ extern "C" __global__ void computeNonbonded(
...
@@ -590,15 +465,9 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
x
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
x
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
y
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
y
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
z
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
shflForce
.
z
)));
#else
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
localData
[
threadIdx
.
x
].
fx
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
localData
[
threadIdx
.
x
].
fy
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
(
realToFixedPoint
(
localData
[
threadIdx
.
x
].
fz
)));
#endif
}
}
#endif
#endif
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment