Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
08f1039f
Commit
08f1039f
authored
May 23, 2013
by
peastman
Browse files
Merge pull request #7 from proteneer/master
Nonbonded.cu now uses shuffle intrinsic
parents
7521da4a
8e6c1180
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
304 additions
and
96 deletions
+304
-96
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+129
-37
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+175
-59
No files found.
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
08f1039f
...
@@ -416,6 +416,12 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
...
@@ -416,6 +416,12 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
}
}
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
map
<
string
,
string
>
defines
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
{
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
}
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
...
@@ -445,29 +451,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -445,29 +451,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
args
<<
arguments
[
i
].
getName
();
args
<<
arguments
[
i
].
getName
();
}
}
replacements
[
"PARAMETER_ARGUMENTS"
]
=
args
.
str
();
replacements
[
"PARAMETER_ARGUMENTS"
]
=
args
.
str
();
stringstream
loadLocal1
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal1
<<
"localData[localAtomIndex]."
<<
params
[
i
].
getName
()
<<
" = "
<<
params
[
i
].
getName
()
<<
"1;
\n
"
;
}
else
{
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal1
<<
"localData[localAtomIndex]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = "
<<
params
[
i
].
getName
()
<<
"1."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
stringstream
loadLocal2
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal2
<<
"localData[localAtomIndex]."
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
else
{
loadLocal2
<<
params
[
i
].
getType
()
<<
" temp_"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal2
<<
"localData[localAtomIndex]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = temp_"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
stringstream
load1
;
stringstream
load1
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
load1
<<
params
[
i
].
getType
();
load1
<<
params
[
i
].
getType
();
...
@@ -478,23 +462,133 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -478,23 +462,133 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
load1
<<
"[atom1];
\n
"
;
load1
<<
"[atom1];
\n
"
;
}
}
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
stringstream
load2j
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
bool
useShuffle
;
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
if
(
defines
.
find
(
"ENABLE_SHUFFLE"
)
!=
defines
.
end
())
{
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = localData[atom2]."
<<
params
[
i
].
getName
()
<<
";
\n
"
;
useShuffle
=
true
;
}
else
{
useShuffle
=
false
;
}
// Part 1. Defines for on diagonal exclusion tiles
stringstream
loadLocal1
;
if
(
useShuffle
)
{
// not needed if using shuffles as we can directly fetch from register
}
else
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal1
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
" = "
<<
params
[
i
].
getName
()
<<
"1;
\n
"
;
}
else
{
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal1
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = "
<<
params
[
i
].
getName
()
<<
"1."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
}
else
{
}
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = make_"
<<
params
[
i
].
getType
()
<<
"("
;
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
{
if
(
j
>
0
)
stringstream
broadcastWarpData
;
load2j
<<
", "
;
if
(
useShuffle
)
{
load2j
<<
"localData[atom2]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
];
broadcastWarpData
<<
"posq2.x = real_shfl(shflPosq.x, j);
\n
"
;
broadcastWarpData
<<
"posq2.y = real_shfl(shflPosq.y, j);
\n
"
;
broadcastWarpData
<<
"posq2.z = real_shfl(shflPosq.z, j);
\n
"
;
broadcastWarpData
<<
"posq2.w = real_shfl(shflPosq.w, j);
\n
"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
broadcastWarpData
<<
params
[
i
].
getType
()
<<
" shfl"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
j
++
)
{
string
name
;
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
broadcastWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
<<
"=real_shfl("
<<
params
[
i
].
getName
()
<<
"1,j);
\n
"
;
}
else
{
broadcastWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
+
"."
+
suffixes
[
j
]
<<
"=real_shfl("
<<
params
[
i
].
getName
()
+
"1."
+
suffixes
[
j
]
<<
",j);
\n
"
;
}
}
}
}
else
{
// not used if not shuffling
}
replacements
[
"BROADCAST_WARP_DATA"
]
=
broadcastWarpData
.
str
();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream
declareLocal2
;
if
(
useShuffle
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
declareLocal2
<<
params
[
i
].
getType
()
<<
" shfl"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
}
else
{
// not used if using shared memory
}
replacements
[
"DECLARE_LOCAL_PARAMETERS"
]
=
declareLocal2
.
str
();
stringstream
loadLocal2
;
if
(
useShuffle
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
loadLocal2
<<
"shfl"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
}
else
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal2
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
else
{
loadLocal2
<<
params
[
i
].
getType
()
<<
" temp_"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal2
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = temp_"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
stringstream
load2j
;
if
(
useShuffle
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = shfl"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
else
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = localData[atom2]."
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
else
{
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = make_"
<<
params
[
i
].
getType
()
<<
"("
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
{
if
(
j
>
0
)
load2j
<<
", "
;
load2j
<<
"localData[atom2]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
];
}
load2j
<<
");
\n
"
;
}
}
load2j
<<
");
\n
"
;
}
}
}
}
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
map
<
string
,
string
>
defines
;
stringstream
shuffleWarpData
;
if
(
useShuffle
)
{
shuffleWarpData
<<
"shflPosq.x = real_shfl(shflPosq.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.y = real_shfl(shflPosq.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.z = real_shfl(shflPosq.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.w = real_shfl(shflPosq.w, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflForce.x = real_shfl(shflForce.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflForce.y = real_shfl(shflForce.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflForce.z = real_shfl(shflForce.z, tgx+1);
\n
"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
shuffleWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
<<
"=real_shfl(shfl"
<<
params
[
i
].
getName
()
<<
", tgx+1);
\n
"
;
}
else
{
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
j
++
)
{
// looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
shuffleWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
"=real_shfl(shfl"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
", tgx+1);
\n
"
;
}
}
}
}
else
{
// not used otherwise
}
replacements
[
"SHUFFLE_WARP_DATA"
]
=
shuffleWarpData
.
str
();
if
(
useCutoff
)
if
(
useCutoff
)
defines
[
"USE_CUTOFF"
]
=
"1"
;
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
if
(
usePeriodic
)
...
@@ -519,8 +613,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -519,8 +613,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"LAST_EXCLUSION_TILE"
]
=
context
.
intToString
(
endExclusionIndex
);
defines
[
"LAST_EXCLUSION_TILE"
]
=
context
.
intToString
(
endExclusionIndex
);
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
08f1039f
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#ifndef ENABLE_SHUFFLE
typedef
struct
{
typedef
struct
{
real
x
,
y
,
z
;
real
x
,
y
,
z
;
real
q
;
real
q
;
...
@@ -9,6 +10,20 @@ typedef struct {
...
@@ -9,6 +10,20 @@ typedef struct {
real
padding
;
real
padding
;
#endif
#endif
}
AtomData
;
}
AtomData
;
#endif
//support for 64 bit shuffles
static
__inline__
__device__
float
real_shfl
(
float
var
,
int
srcLane
)
{
return
__shfl
(
var
,
srcLane
);
}
static
__inline__
__device__
double
real_shfl
(
double
var
,
int
srcLane
)
{
int
hi
,
lo
;
asm
volatile
(
"mov.b64 { %0, %1 }, %2;"
:
"=r"
(
lo
),
"=r"
(
hi
)
:
"d"
(
var
));
hi
=
__shfl
(
hi
,
srcLane
);
lo
=
__shfl
(
lo
,
srcLane
);
return
__hiloint2double
(
hi
,
lo
);
}
/**
/**
* Compute nonbonded interactions. The kernel is separated into two parts,
* Compute nonbonded interactions. The kernel is separated into two parts,
...
@@ -40,12 +55,15 @@ typedef struct {
...
@@ -40,12 +55,15 @@ typedef struct {
* t o 3 4 5 6 7 8 1 2
* t o 3 4 5 6 7 8 1 2
* a p 2 3 4 5 6 7 8 1
* a p 2 3 4 5 6 7 8 1
*
*
* TODO: Implement shuffle as opposed to using nonbonded.
*
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* and follows the same force accumulation method. If more there are more interactingTiles
* and follows the same force accumulation method. If more there are more interactingTiles
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* and the full tileset.
* and the full tileset is computed. This should happen on the first step, and very rarely
* afterwards.
*
* On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
* __shfl to broadcast. For all other types of tiles __shfl is used to pass around the
* forces, positions, and parameters when computing the forces.
*
*
* [out]forceBuffers - forces on each atom to eventually be accumulated
* [out]forceBuffers - forces on each atom to eventually be accumulated
* [out]energyBuffer - energyBuffer to eventually be accumulated
* [out]energyBuffer - energyBuffer to eventually be accumulated
...
@@ -83,10 +101,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -83,10 +101,11 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
real
energy
=
0.0
f
;
real
energy
=
0.0
f
;
// used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
#endif
// First loop: process tiles that contain exclusions.
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
...
@@ -103,16 +122,26 @@ extern "C" __global__ void computeNonbonded(
...
@@ -103,16 +122,26 @@ extern "C" __global__ void computeNonbonded(
const
bool
hasExclusions
=
true
;
const
bool
hasExclusions
=
true
;
if
(
x
==
y
)
{
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
#ifdef ENABLE_SHUFFLE
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
real4
shflPosq
=
posq1
;
localData
[
localAtomIndex
].
x
=
posq1
.
x
;
#else
localData
[
localAtomIndex
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
localAtomIndex
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
localAtomIndex
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#endif
// we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
real4
posq2
;
#ifdef ENABLE_SHUFFLE
BROADCAST_WARP_DATA
#else
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -152,25 +181,35 @@ extern "C" __global__ void computeNonbonded(
...
@@ -152,25 +181,35 @@ extern "C" __global__ void computeNonbonded(
}
}
else
{
else
{
// This is an off-diagonal tile.
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
real4
shflPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
x
=
tempPosq
.
x
;
#ifdef ENABLE_SHUFFLE
localData
[
localAtomIndex
].
y
=
tempPosq
.
y
;
real3
shflForce
;
localData
[
localAtomIndex
].
z
=
tempPosq
.
z
;
shflForce
.
x
=
0.0
f
;
localData
[
localAtomIndex
].
q
=
tempPosq
.
w
;
shflForce
.
y
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
#else
localData
[
threadIdx
.
x
].
x
=
shflPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
shflPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
shflPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
shflPosq
.
w
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
#endif
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
fx
=
0.0
f
;
localData
[
localAtomIndex
].
fy
=
0.0
f
;
localData
[
localAtomIndex
].
fz
=
0.0
f
;
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
#endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
#else
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -202,46 +241,64 @@ extern "C" __global__ void computeNonbonded(
...
@@ -202,46 +241,64 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#else
#endif
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
#endif
// cycles the indices
// cycles the indices
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
// write results for off diagonal tiles
// Write results.
#ifdef ENABLE_SHUFFLE
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
x
*
0x100000000
)));
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
#else
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
#endif
}
}
// Write results for on and off diagonal tiles
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
// of them (no cutoff).
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
...
@@ -253,6 +310,8 @@ extern "C" __global__ void computeNonbonded(
...
@@ -253,6 +310,8 @@ extern "C" __global__ void computeNonbonded(
#endif
#endif
int
skipBase
=
0
;
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
int
currentSkipIndex
=
tbx
;
// atomIndices can probably be shuffled as well
// but it probably wouldn't make things any faster
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
volatile
int
skipTiles
[
THREAD_BLOCK_SIZE
];
__shared__
volatile
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
skipTiles
[
threadIdx
.
x
]
=
-
1
;
...
@@ -263,7 +322,6 @@ extern "C" __global__ void computeNonbonded(
...
@@ -263,7 +322,6 @@ extern "C" __global__ void computeNonbonded(
bool
includeTile
=
true
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
...
@@ -303,46 +361,64 @@ extern "C" __global__ void computeNonbonded(
...
@@ -303,46 +361,64 @@ extern "C" __global__ void computeNonbonded(
}
}
if
(
includeTile
)
{
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
//
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
atomIndices
[
threadIdx
.
x
]
=
j
;
#ifdef ENABLE_SHUFFLE
DECLARE_LOCAL_PARAMETERS
real4
shflPosq
;
real3
shflForce
;
shflForce
.
x
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
#endif
if
(
j
<
PADDED_NUM_ATOMS
)
{
if
(
j
<
PADDED_NUM_ATOMS
)
{
// Load position of atom j from from global memory
// Load position of atom j from from global memory
real4
tempPosq
=
posq
[
j
];
#ifdef ENABLE_SHUFFLE
localData
[
localAtomIndex
].
x
=
tempPosq
.
x
;
shflPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
y
=
tempPosq
.
y
;
#else
localData
[
localAtomIndex
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
x
=
posq
[
j
].
x
;
localData
[
localAtomIndex
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
y
=
posq
[
j
].
y
;
localData
[
threadIdx
.
x
].
z
=
posq
[
j
].
z
;
localData
[
threadIdx
.
x
].
q
=
posq
[
j
].
w
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
fx
=
0.0
f
;
localData
[
localAtomIndex
].
fy
=
0.0
f
;
localData
[
localAtomIndex
].
fz
=
0.0
f
;
}
}
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
localAtomIndex
].
x
-=
floor
((
localData
[
localAtomIndex
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
#ifdef ENABLE_SHUFFLE
localData
[
localAtomIndex
].
y
-=
floor
((
localData
[
localAtomIndex
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
shflPosq
.
x
-=
floor
((
shflPosq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
localAtomIndex
].
z
-=
floor
((
localData
[
localAtomIndex
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
shflPosq
.
y
-=
floor
((
shflPosq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
shflPosq
.
z
-=
floor
((
shflPosq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#else
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
#else
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
...
@@ -367,18 +443,34 @@ extern "C" __global__ void computeNonbonded(
...
@@ -367,18 +443,34 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#else
#endif
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif
#endif // end USE_SYMMETRIC
}
}
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
...
@@ -386,11 +478,14 @@ extern "C" __global__ void computeNonbonded(
...
@@ -386,11 +478,14 @@ extern "C" __global__ void computeNonbonded(
#endif
#endif
{
{
// We need to apply periodic boundary conditions separately for each interaction.
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
#else
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -422,26 +517,41 @@ extern "C" __global__ void computeNonbonded(
...
@@ -422,26 +517,41 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#else
#endif
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
#else
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
// Write results.
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
...
@@ -451,12 +561,18 @@ extern "C" __global__ void computeNonbonded(
...
@@ -451,12 +561,18 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
z
*
0x100000000
)));
#else
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
#endif
}
}
}
}
pos
++
;
pos
++
;
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment