Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
44665537
Commit
44665537
authored
May 21, 2013
by
Yutong Zhao
Browse files
first working version with shuffle - but breaks lots of existing tests
parent
faf5e0fe
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
315 additions
and
175 deletions
+315
-175
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+8
-7
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+108
-0
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+187
-156
platforms/cuda/tests/TestCudaNonbondedForce.cpp
platforms/cuda/tests/TestCudaNonbondedForce.cpp
+12
-12
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
44665537
...
...
@@ -394,7 +394,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// Write out the source to a temporary file.
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
tempFileName
<<
"openmmTempKernel"
<<
/*rand() <<*/
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
string
inputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".cu"
);
string
outputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".ptx"
);
string
logFile
=
(
tempDir
+
tempFileName
.
str
()
+
".log"
);
...
...
@@ -428,6 +428,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
log
.
close
();
}
cout
<<
error
.
str
()
<<
endl
;
throw
OpenMMException
(
error
.
str
());
}
CUmodule
module
;
...
...
@@ -437,15 +438,15 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
m
<<
"Error loading CUDA module: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
throw
OpenMMException
(
m
.
str
());
}
remove
(
inputFile
.
c_str
());
remove
(
outputFile
.
c_str
());
remove
(
logFile
.
c_str
());
//
remove(inputFile.c_str());
//
remove(outputFile.c_str());
//
remove(logFile.c_str());
return
module
;
}
catch
(...)
{
remove
(
inputFile
.
c_str
());
remove
(
outputFile
.
c_str
());
remove
(
logFile
.
c_str
());
//
remove(inputFile.c_str());
//
remove(outputFile.c_str());
//
remove(logFile.c_str());
throw
;
}
}
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
44665537
...
...
@@ -445,6 +445,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
args
<<
arguments
[
i
].
getName
();
}
replacements
[
"PARAMETER_ARGUMENTS"
]
=
args
.
str
();
/*
stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
...
...
@@ -456,6 +458,17 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
*/
stringstream
loadLocal1
;
loadLocal1
<<
"tempSigmaEpsilon = sigmaEpsilon1;"
<<
endl
;
//for (int i = 0; i < (int) params.size(); i++) {
// loadLocal1<<params[i].getType()<<" temp"<<params[i].getName()<<"="<<params[i].getName()<<"1;\n";
//}
//cout << loadLocal1.str() << endl;
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
/*
stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
...
...
@@ -468,6 +481,40 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
*/
stringstream
declareLocal2
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
// loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
else
{
declareLocal2
<<
params
[
i
].
getType
()
<<
" temp"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
}
replacements
[
"DECLARE_LOCAL_PARAMETERS"
]
=
declareLocal2
.
str
();
stringstream
loadLocal2
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
// loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
else
{
loadLocal2
<<
"temp"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
}
/*
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
else {
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
}
}
*/
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
stringstream
load1
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
load1
<<
params
[
i
].
getType
();
...
...
@@ -478,6 +525,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
load1
<<
"[atom1];
\n
"
;
}
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
/*
stringstream load2j;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
...
...
@@ -494,6 +543,65 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
*/
stringstream
load2j
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
/*
if (params[i].getNumComponents() == 1) {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = "<<params[i].getName()<<";\n";
}
else {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = make_"<<params[i].getType()<<"(";
for (int j = 0; j < params[i].getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<params[i].getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}*/
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = temp"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
stringstream
broadcastWarpData
;
broadcastWarpData
<<
"posq2.x = __shfl(tempPosq.x, j);
\n
"
;
broadcastWarpData
<<
"posq2.y = __shfl(tempPosq.y, j);
\n
"
;
broadcastWarpData
<<
"posq2.z = __shfl(tempPosq.z, j);
\n
"
;
broadcastWarpData
<<
"posq2.w = __shfl(tempPosq.w, j);
\n
"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
broadcastWarpData
<<
params
[
i
].
getType
()
<<
" temp"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
j
++
)
{
string
name
;
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
broadcastWarpData
<<
"temp"
<<
params
[
i
].
getName
()
<<
"=__shfl("
<<
params
[
i
].
getName
()
<<
"1,j);
\n
"
;
}
else
{
broadcastWarpData
<<
"temp"
<<
params
[
i
].
getName
()
+
"."
+
suffixes
[
j
]
<<
"=__shfl("
<<
params
[
i
].
getName
()
+
"1."
+
suffixes
[
j
]
<<
",j);
\n
"
;
}
}
}
replacements
[
"BROADCAST_WARP_DATA"
]
=
broadcastWarpData
.
str
();
stringstream
shuffleWarpData
;
shuffleWarpData
<<
"tempPosq.x = __shfl(tempPosq.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempPosq.y = __shfl(tempPosq.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempPosq.z = __shfl(tempPosq.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempPosq.w = __shfl(tempPosq.w, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempForces.x = __shfl(tempForces.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempForces.y = __shfl(tempForces.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempForces.z = __shfl(tempForces.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempsigmaEpsilon.x = __shfl(tempsigmaEpsilon.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempsigmaEpsilon.y = __shfl(tempsigmaEpsilon.y, tgx+1);
\n
"
;
/*
for(int i=0; i< (int) params.size(); i++) {
shuffleWarpData << params[i].getName() << "=__shfl(" << params[i].getName() << ", tgx+1);\n";
}
*/
replacements
[
"SHUFFLE_WARP_DATA"
]
=
shuffleWarpData
.
str
();
map
<
string
,
string
>
defines
;
if
(
useCutoff
)
defines
[
"USE_CUTOFF"
]
=
"1"
;
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
44665537
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
// structs are aligned to host compiler rules by default.
// large structures can spill into cache if using registers.
// this would defeat the purpose of using shuffles!
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
...
...
@@ -16,7 +19,10 @@ typedef struct {
* implicit warp-level synchronization. A tile is defined by two atom blocks
* each of warpsize. Each warp computes a range of tiles.
*
* Tiles with exclusions compute the entire set of interactions across
* On-diagonal tiles processes interaction using a naive all-against-one interaction
* accumulation scheme.
*
* Off-diagonal tiles with exclusions compute the entire set of interactions across
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below
* where, suppose
...
...
@@ -43,7 +49,7 @@ typedef struct {
* TODO: Implement shuffle as opposed to using nonbonded.
*
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* and follows the same force accumulation method. If more there are more interactingTiles
* and follows the same force accumulation method
above
. If more there are more interactingTiles
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* and the full tileset.
*
...
...
@@ -83,9 +89,6 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
real
energy
=
0.0
f
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
...
...
@@ -96,23 +99,29 @@ extern "C" __global__ void computeNonbonded(
real3
force
=
make_real3
(
0
);
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
tileflags
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
const
bool
hasExclusions
=
true
;
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
x
=
posq1
.
x
;
localData
[
localAtomIndex
].
y
=
posq1
.
y
;
localData
[
lo
c
al
AtomIndex
].
z
=
posq1
.
z
;
localData
[
localAtomIndex
].
q
=
posq1
.
w
;
LOAD_LOCAL_PARAMETERS_FROM_1
real4
tempPosq
=
posq1
;
// we do not need to fetch parameters from g
lo
b
al
since this is a symmetric tile
// instead we can broadcast the values using shuffle
//
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
real4
posq2
;
// load in the data from other registers
BROADCAST_WARP_DATA
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
...
@@ -150,27 +159,25 @@ extern "C" __global__ void computeNonbonded(
#endif
}
}
else
{
// This is an off-diagonal tile.
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
x
=
tempPosq
.
x
;
localData
[
localAtomIndex
].
y
=
tempPosq
.
y
;
localData
[
localAtomIndex
].
z
=
tempPosq
.
z
;
localData
[
localAtomIndex
].
q
=
tempPosq
.
w
;
real3
tempForces
;
tempForces
.
x
=
0.0
f
;
tempForces
.
y
=
0.0
f
;
tempForces
.
z
=
0.0
f
;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
fx
=
0.0
f
;
localData
[
localAtomIndex
].
fy
=
0.0
f
;
localData
[
localAtomIndex
].
fz
=
0.0
f
;
#ifdef USE_EXCLUSIONS
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
)
;
real4
posq2
=
tempPosq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
...
@@ -178,6 +185,7 @@ extern "C" __global__ void computeNonbonded(
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
...
...
@@ -202,41 +210,47 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
f
x
+=
delta
.
x
;
localData
[
tbx
+
tj
].
f
y
+=
delta
.
y
;
localData
[
tbx
+
tj
].
f
z
+=
delta
.
z
;
tempForces
.
x
+=
delta
.
x
;
tempForces
.
y
+=
delta
.
y
;
tempForces
.
z
+=
delta
.
z
;
#else
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
localData
[
tbx
+
tj
].
f
x
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
f
y
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
f
z
+=
dEdR2
.
z
;
tempForces
.
x
+=
dEdR2
.
x
;
tempForces
.
y
+=
dEdR2
.
y
;
tempForces
.
z
+=
dEdR2
.
z
;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
// cycles the indices
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
SHUFFLE_WARP_DATA
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
z
*
0x100000000
)));
}
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
f
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
f
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
f
z
*
0x100000000
)));
}
//
if (x != y) {
//
offset = y*TILE_SIZE + tgx;
//
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (
tempForces.
x*0x100000000)));
//
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (
tempForces.
y*0x100000000)));
//
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (
tempForces.
z*0x100000000)));
//
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
...
...
@@ -305,7 +319,6 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
...
...
@@ -315,17 +328,26 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
real4
tempPosq
;
real3
tempForces
;
tempForces
.
x
=
0.0
f
;
tempForces
.
y
=
0.0
f
;
tempForces
.
z
=
0.0
f
;
DECLARE_LOCAL_PARAMETERS
if
(
j
<
PADDED_NUM_ATOMS
)
{
// Load position of atom j from from global memory
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
x
=
tempPosq
.
x
;
localData
[
localAtomIndex
].
y
=
tempPosq
.
y
;
localData
[
localAtomIndex
].
z
=
tempPosq
.
z
;
localData
[
localAtomIndex
].
q
=
tempPosq
.
w
;
tempPosq
=
posq
[
j
];
//localData[localAtomIndex].x = tempPosq.x;
//localData[localAtomIndex].y = tempPosq.y;
//localData[localAtomIndex].z = tempPosq.z;
//localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
fx
=
0.0
f
;
localData
[
localAtomIndex
].
fy
=
0.0
f
;
localData
[
localAtomIndex
].
fz
=
0.0
f
;
//
localData[localAtomIndex].fx = 0.0f;
//
localData[localAtomIndex].fy = 0.0f;
//
localData[localAtomIndex].fz = 0.0f;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
...
...
@@ -336,13 +358,18 @@ extern "C" __global__ void computeNonbonded(
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
localAtomIndex
].
x
-=
floor
((
localData
[
localAtomIndex
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
localAtomIndex
].
y
-=
floor
((
localData
[
localAtomIndex
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
localAtomIndex
].
z
-=
floor
((
localData
[
localAtomIndex
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
//localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
//localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
//localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
tempPosq
.
x
-=
floor
((
tempPosq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
tempPosq
.
y
-=
floor
((
tempPosq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
tempPosq
.
z
-=
floor
((
tempPosq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
)
;
real4
posq2
=
tempPosq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
...
...
@@ -367,30 +394,31 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
f
x
+=
delta
.
x
;
localData
[
tbx
+
tj
].
f
y
+=
delta
.
y
;
localData
[
tbx
+
tj
].
f
z
+=
delta
.
z
;
tempForces
.
x
+=
delta
.
x
;
tempForces
.
y
+=
delta
.
y
;
tempForces
.
z
+=
delta
.
z
;
#else
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
localData
[
tbx
+
tj
].
f
x
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
f
y
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
f
z
+=
dEdR2
.
z
;
tempForces
.
x
+=
dEdR2
.
x
;
tempForces
.
y
+=
dEdR2
.
y
;
tempForces
.
z
+=
dEdR2
.
z
;
#endif
}
SHUFFLE_WARP_DATA
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
)
;
real4
posq2
=
tempPosq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
...
@@ -398,6 +426,7 @@ extern "C" __global__ void computeNonbonded(
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
...
...
@@ -422,20 +451,21 @@ extern "C" __global__ void computeNonbonded(
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
f
x
+=
delta
.
x
;
localData
[
tbx
+
tj
].
f
y
+=
delta
.
y
;
localData
[
tbx
+
tj
].
f
z
+=
delta
.
z
;
tempForces
.
x
+=
delta
.
x
;
tempForces
.
y
+=
delta
.
y
;
tempForces
.
z
+=
delta
.
z
;
#else
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
localData
[
tbx
+
tj
].
f
x
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
f
y
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
f
z
+=
dEdR2
.
z
;
tempForces
.
x
+=
dEdR2
.
x
;
tempForces
.
y
+=
dEdR2
.
y
;
tempForces
.
z
+=
dEdR2
.
z
;
#endif
#ifdef USE_CUTOFF
}
#endif
SHUFFLE_WARP_DATA
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
...
...
@@ -451,12 +481,13 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
f
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
f
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
f
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
z
*
0x100000000
)));
}
}
pos
++
;
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda/tests/TestCudaNonbondedForce.cpp
View file @
44665537
...
...
@@ -872,21 +872,21 @@ int main(int argc, char* argv[]) {
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"CudaPrecision"
,
string
(
argv
[
1
]));
testCoulomb
();
testLJ
();
testExclusionsAnd14
();
testCutoff
();
testCutoff14
();
testPeriodic
();
//
testCoulomb();
//
testLJ();
//
testExclusionsAnd14();
//
testCutoff();
//
testCutoff14();
//
testPeriodic();
testLargeSystem
();
//testBlockInteractions(false);
//testBlockInteractions(true);
testDispersionCorrection
();
testChangingParameters
();
testParallelComputation
(
false
);
testParallelComputation
(
true
);
testSwitchingFunction
(
NonbondedForce
::
CutoffNonPeriodic
);
testSwitchingFunction
(
NonbondedForce
::
PME
);
//
testDispersionCorrection();
//
testChangingParameters();
//
testParallelComputation(false);
//
testParallelComputation(true);
//
testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
//
testSwitchingFunction(NonbondedForce::PME);
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment