Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
ec39f6ff
Commit
ec39f6ff
authored
May 23, 2013
by
Yutong Zhao
Browse files
nonbonded.cu will now use shuffles on sm_30 or higher
parent
44665537
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
412 additions
and
340 deletions
+412
-340
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+7
-7
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+115
-128
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+278
-193
platforms/cuda/tests/TestCudaNonbondedForce.cpp
platforms/cuda/tests/TestCudaNonbondedForce.cpp
+12
-12
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
ec39f6ff
...
@@ -394,7 +394,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -394,7 +394,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// Write out the source to a temporary file.
// Write out the source to a temporary file.
stringstream
tempFileName
;
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
/*rand() <<*/
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
string
inputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".cu"
);
string
inputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".cu"
);
string
outputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".ptx"
);
string
outputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".ptx"
);
string
logFile
=
(
tempDir
+
tempFileName
.
str
()
+
".log"
);
string
logFile
=
(
tempDir
+
tempFileName
.
str
()
+
".log"
);
...
@@ -438,15 +438,15 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -438,15 +438,15 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
m
<<
"Error loading CUDA module: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
m
<<
"Error loading CUDA module: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
throw
OpenMMException
(
m
.
str
());
throw
OpenMMException
(
m
.
str
());
}
}
//
remove(inputFile.c_str());
remove
(
inputFile
.
c_str
());
//
remove(outputFile.c_str());
remove
(
outputFile
.
c_str
());
//
remove(logFile.c_str());
remove
(
logFile
.
c_str
());
return
module
;
return
module
;
}
}
catch
(...)
{
catch
(...)
{
//
remove(inputFile.c_str());
remove
(
inputFile
.
c_str
());
//
remove(outputFile.c_str());
remove
(
outputFile
.
c_str
());
//
remove(logFile.c_str());
remove
(
logFile
.
c_str
());
throw
;
throw
;
}
}
}
}
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
ec39f6ff
...
@@ -416,6 +416,11 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
...
@@ -416,6 +416,11 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
}
}
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
map
<
string
,
string
>
defines
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
...
@@ -446,163 +451,145 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -446,163 +451,145 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
replacements
[
"PARAMETER_ARGUMENTS"
]
=
args
.
str
();
replacements
[
"PARAMETER_ARGUMENTS"
]
=
args
.
str
();
/*
stringstream
load1
;
stringstream loadLocal1;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if (params[i].getNumComponents() == 1) {
load1
<<
params
[
i
].
getType
();
loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
load1
<<
" "
;
}
load1
<<
params
[
i
].
getName
();
else {
load1
<<
"1 = global_"
;
for (int j = 0; j < params[i].getNumComponents(); ++j)
load1
<<
params
[
i
].
getName
();
loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
load1
<<
"[atom1];
\n
"
;
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
*/
bool
useShuffle
=
(
defines
[
"ENABLE_SHUFFLE"
]
==
"1"
);
// Part 1. Defines for on diagonal exclusion tiles
stringstream
loadLocal1
;
stringstream
loadLocal1
;
if
(
useShuffle
)
{
loadLocal1
<<
"tempSigmaEpsilon = sigmaEpsilon1;"
<<
endl
;
// not needed if using shuffles as we can directly fetch from
//for (int i = 0; i < (int) params.size(); i++) {
// LOAD_ATOM1_PARAMETERS
// loadLocal1<<params[i].getType()<<" temp"<<params[i].getName()<<"="<<params[i].getName()<<"1;\n";
}
else
{
//}
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
//cout << loadLocal1.str() << endl;
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal1
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
" = "
<<
params
[
i
].
getName
()
<<
"1;
\n
"
;
}
else
{
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal1
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = "
<<
params
[
i
].
getName
()
<<
"1."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
/*
stringstream
broadcastWarpData
;
stringstream loadLocal2;
if
(
useShuffle
)
{
for (int i = 0; i < (int) params.size(); i++) {
broadcastWarpData
<<
"posq2.x = real_shfl(shflPosq.x, j);
\n
"
;
if (params[i].getNumComponents() == 1) {
broadcastWarpData
<<
"posq2.y = real_shfl(shflPosq.y, j);
\n
"
;
loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
broadcastWarpData
<<
"posq2.z = real_shfl(shflPosq.z, j);
\n
"
;
}
broadcastWarpData
<<
"posq2.w = real_shfl(shflPosq.w, j);
\n
"
;
else {
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
broadcastWarpData
<<
params
[
i
].
getType
()
<<
" shfl"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
for (int j = 0; j < params[i].getNumComponents(); ++j)
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
j
++
)
{
loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
string
name
;
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
broadcastWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
<<
"=real_shfl("
<<
params
[
i
].
getName
()
<<
"1,j);
\n
"
;
}
else
{
broadcastWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
+
"."
+
suffixes
[
j
]
<<
"=real_shfl("
<<
params
[
i
].
getName
()
+
"1."
+
suffixes
[
j
]
<<
",j);
\n
"
;
}
}
}
}
}
else
{
// not used if not shuffling
}
}
replacements["
L
OAD
_LOCAL_PARAMETERS_FROM_GLOBAL
"] =
l
oad
Local2
.str();
replacements
[
"
BR
OAD
CAST_WARP_DATA
"
]
=
br
oad
castWarpData
.
str
();
*/
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream
declareLocal2
;
stringstream
declareLocal2
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
useShuffle
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
// loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
declareLocal2
<<
params
[
i
].
getType
()
<<
" shfl"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
else
{
//if (params[i].getNumComponents() == 1) {
declareLocal2
<<
params
[
i
].
getType
()
<<
" temp"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
//declareLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
//} else {
// declareLocal2<<params[i].getType()<<" temp"<<params[i].getName()<<";\n";
//}
}
}
}
else
{
// not used if using shared memory
}
}
replacements
[
"DECLARE_LOCAL_PARAMETERS"
]
=
declareLocal2
.
str
();
replacements
[
"DECLARE_LOCAL_PARAMETERS"
]
=
declareLocal2
.
str
();
stringstream
loadLocal2
;
stringstream
loadLocal2
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
useShuffle
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
// loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
loadLocal2
<<
"shfl"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
else
{
loadLocal2
<<
"temp"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
}
}
}
else
{
/*
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
for (int i = 0; i < (int) params.size(); i++
) {
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
if (params[i].getNumComponents() == 1) {
loadLocal2
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
}
else
{
else {
loadLocal2
<<
params
[
i
].
getType
()
<<
" temp_"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2
<<
"localData[threadIdx.x]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = temp_"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
";
\n
"
;
loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
}
}
}
}
}
*/
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
stringstream
load1
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
load1
<<
params
[
i
].
getType
();
load1
<<
" "
;
load1
<<
params
[
i
].
getName
();
load1
<<
"1 = global_"
;
load1
<<
params
[
i
].
getName
();
load1
<<
"[atom1];
\n
"
;
}
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
/*
stringstream
load2j
;
stringstream
load2j
;
for (int i = 0; i < (int) params.size(); i++) {
if
(
useShuffle
)
{
if (params[i].getNumComponents() == 1) {
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = shfl"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
}
else
{
else {
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = make_"<<params[i].getType()<<"(";
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
for (int j = 0; j < params[i].getNumComponents(); ++j) {
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = localData[atom2]."
<<
params
[
i
].
getName
()
<<
";
\n
"
;
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<params[i].getName()<<"_"<<suffixes[j];
}
}
load2j<<");\n";
else
{
}
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = make_"
<<
params
[
i
].
getType
()
<<
"("
;
}
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
{
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
if
(
j
>
0
)
*/
load2j
<<
", "
;
stringstream
load2j
;
load2j
<<
"localData[atom2]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
];
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
}
/*
load2j
<<
");
\n
"
;
if (params[i].getNumComponents() == 1) {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = "<<params[i].getName()<<";\n";
}
else {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = make_"<<params[i].getType()<<"(";
for (int j = 0; j < params[i].getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<params[i].getName()<<"_"<<suffixes[j];
}
}
load2j<<");\n";
}
}*/
load2j
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
"2 = temp"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
}
}
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2j
.
str
();
stringstream
broadcastWarpData
;
stringstream
shuffleWarpData
;
broadcastWarpData
<<
"posq2.x = __shfl(tempPosq.x, j);
\n
"
;
if
(
useShuffle
)
{
broadcastWarpData
<<
"posq2.y = __shfl(tempPosq.y, j);
\n
"
;
shuffleWarpData
<<
"shflPosq.x = real_shfl(shflPosq.x, tgx+1);
\n
"
;
broadcastWarpData
<<
"posq2.z = __shfl(tempPosq.z, j);
\n
"
;
shuffleWarpData
<<
"shflPosq.y = real_shfl(shflPosq.y, tgx+1);
\n
"
;
broadcastWarpData
<<
"posq2.w = __shfl(tempPosq.w, j);
\n
"
;
shuffleWarpData
<<
"shflPosq.z = real_shfl(shflPosq.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"shflPosq.w = real_shfl(shflPosq.w, tgx+1);
\n
"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
shuffleWarpData
<<
"shflForce.x = real_shfl(shflForce.x, tgx+1);
\n
"
;
broadcastWarpData
<<
params
[
i
].
getType
()
<<
" temp"
<<
params
[
i
].
getName
()
<<
";
\n
"
;
shuffleWarpData
<<
"shflForce.y = real_shfl(shflForce.y, tgx+1);
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
j
++
)
{
shuffleWarpData
<<
"shflForce.z = real_shfl(shflForce.z, tgx+1);
\n
"
;
string
name
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
broadcastWarpData
<<
"temp"
<<
params
[
i
].
getName
()
<<
"=__shfl("
<<
params
[
i
].
getName
()
<<
"1,j);
\n
"
;
shuffleWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
<<
"=real_shfl(shfl"
<<
params
[
i
].
getName
()
<<
", tgx+1);
\n
"
;
}
else
{
}
else
{
broadcastWarpData
<<
"temp"
<<
params
[
i
].
getName
()
+
"."
+
suffixes
[
j
]
<<
"=__shfl("
<<
params
[
i
].
getName
()
+
"1."
+
suffixes
[
j
]
<<
",j);
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
j
++
)
{
// looks something like
// shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
shuffleWarpData
<<
"shfl"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
"=real_shfl(shfl"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
", tgx+1);
\n
"
;
}
}
}
}
}
}
else
{
// not used otherwise
}
}
replacements
[
"BROADCAST_WARP_DATA"
]
=
broadcastWarpData
.
str
();
stringstream
shuffleWarpData
;
shuffleWarpData
<<
"tempPosq.x = __shfl(tempPosq.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempPosq.y = __shfl(tempPosq.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempPosq.z = __shfl(tempPosq.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempPosq.w = __shfl(tempPosq.w, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempForces.x = __shfl(tempForces.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempForces.y = __shfl(tempForces.y, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempForces.z = __shfl(tempForces.z, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempsigmaEpsilon.x = __shfl(tempsigmaEpsilon.x, tgx+1);
\n
"
;
shuffleWarpData
<<
"tempsigmaEpsilon.y = __shfl(tempsigmaEpsilon.y, tgx+1);
\n
"
;
/*
for(int i=0; i< (int) params.size(); i++) {
shuffleWarpData << params[i].getName() << "=__shfl(" << params[i].getName() << ", tgx+1);\n";
}
*/
replacements
[
"SHUFFLE_WARP_DATA"
]
=
shuffleWarpData
.
str
();
replacements
[
"SHUFFLE_WARP_DATA"
]
=
shuffleWarpData
.
str
();
map
<
string
,
string
>
defines
;
if
(
useCutoff
)
if
(
useCutoff
)
defines
[
"USE_CUTOFF"
]
=
"1"
;
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
if
(
usePeriodic
)
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
ec39f6ff
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
// structs are aligned to host compiler rules by default.
#ifndef ENABLE_SHUFFLE
// large structures can spill into cache if using registers.
// this would defeat the purpose of using shuffles!
typedef
struct
{
typedef
struct
{
real
x
,
y
,
z
;
real
x
,
y
,
z
;
real
q
;
real
q
;
...
@@ -12,6 +10,20 @@ typedef struct {
...
@@ -12,6 +10,20 @@ typedef struct {
real
padding
;
real
padding
;
#endif
#endif
}
AtomData
;
}
AtomData
;
#endif
//support for 64 bit shuffles
static
__inline__
__device__
float
real_shfl
(
float
var
,
int
srcLane
)
{
return
__shfl
(
var
,
srcLane
);
}
static
__inline__
__device__
double
real_shfl
(
double
var
,
int
srcLane
)
{
int
hi
,
lo
;
asm
volatile
(
"mov.b64 { %0, %1 }, %2;"
:
"=r"
(
lo
),
"=r"
(
hi
)
:
"d"
(
var
));
hi
=
__shfl
(
hi
,
srcLane
);
lo
=
__shfl
(
lo
,
srcLane
);
return
__hiloint2double
(
hi
,
lo
);
}
/**
/**
* Compute nonbonded interactions. The kernel is separated into two parts,
* Compute nonbonded interactions. The kernel is separated into two parts,
...
@@ -19,10 +31,7 @@ typedef struct {
...
@@ -19,10 +31,7 @@ typedef struct {
* implicit warp-level synchronization. A tile is defined by two atom blocks
* implicit warp-level synchronization. A tile is defined by two atom blocks
* each of warpsize. Each warp computes a range of tiles.
* each of warpsize. Each warp computes a range of tiles.
*
*
* On-diagonal tiles processes interaction using a naive all-against-one interaction
* Tiles with exclusions compute the entire set of interactions across
* accumulation scheme.
*
* Off-diagonal tiles with exclusions compute the entire set of interactions across
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below
* the forces are computed and accumulated diagonally in the manner shown below
* where, suppose
* where, suppose
...
@@ -46,12 +55,15 @@ typedef struct {
...
@@ -46,12 +55,15 @@ typedef struct {
* t o 3 4 5 6 7 8 1 2
* t o 3 4 5 6 7 8 1 2
* a p 2 3 4 5 6 7 8 1
* a p 2 3 4 5 6 7 8 1
*
*
* TODO: Implement shuffle as opposed to using nonbonded.
*
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* and follows the same force accumulation method
above
. If more there are more interactingTiles
* and follows the same force accumulation method. If more there are more interactingTiles
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* and the full tileset.
* and the full tileset is computed. This should happen on the first step, and very rarely
* afterwards.
*
* On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
* __shfl to broadcast. For all other types of tiles __shfl is used to pass around the
* forces, positions, and parameters when computing the forces.
*
*
* [out]forceBuffers - forces on each atom to eventually be accumulated
* [out]forceBuffers - forces on each atom to eventually be accumulated
* [out]energyBuffer - energyBuffer to eventually be accumulated
* [out]energyBuffer - energyBuffer to eventually be accumulated
...
@@ -89,7 +101,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -89,7 +101,11 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
real
energy
=
0.0
f
;
real
energy
=
0.0
f
;
// used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
#endif
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
...
@@ -99,29 +115,33 @@ extern "C" __global__ void computeNonbonded(
...
@@ -99,29 +115,33 @@ extern "C" __global__ void computeNonbonded(
real3
force
=
make_real3
(
0
);
real3
force
=
make_real3
(
0
);
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
tileflags
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
tileflags
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
#endif
const
bool
hasExclusions
=
true
;
const
bool
hasExclusions
=
true
;
if
(
x
==
y
)
{
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
#ifdef ENABLE_SHUFFLE
real4
shflPosq
=
posq1
;
#elif
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
LOAD_LOCAL_PARAMETERS_FROM_1
#endif
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
real4
tempPosq
=
posq1
;
// we do not need to fetch parameters from global since this is a symmetric tile
// we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle
// instead we can broadcast the values using shuffle
// LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
;
real4
posq2
;
#ifdef ENABLE_SHUFFLE
// load in the data from other registers
BROADCAST_WARP_DATA
BROADCAST_WARP_DATA
#elif
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -159,16 +179,24 @@ extern "C" __global__ void computeNonbonded(
...
@@ -159,16 +179,24 @@ extern "C" __global__ void computeNonbonded(
#endif
#endif
}
}
}
}
else
{
// This is an off-diagonal tile.
else
{
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
real4
shflPosq
=
posq
[
j
];
#ifdef ENABLE_SHUFFLE
real3
tempForces
;
real3
shflForce
;
tempForces
.
x
=
0.0
f
;
shflForce
.
x
=
0.0
f
;
tempForces
.
y
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
tempForces
.
z
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
#elif
localData
[
threadIdx
.
x
].
x
=
shflPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
shflPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
shflPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
shflPosq
.
w
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
#endif
DECLARE_LOCAL_PARAMETERS
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
...
@@ -177,7 +205,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -177,7 +205,11 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
tempPosq
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
#elif
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -185,77 +217,88 @@ extern "C" __global__ void computeNonbonded(
...
@@ -185,77 +217,88 @@ extern "C" __global__ void computeNonbonded(
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
#endif
real
tempEnergy
=
0.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
#ifdef USE_CUTOFF
#elif
if
(
r2
<
CUTOFF_SQUARED
)
{
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
#endif
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
real
invR
=
RSQRT
(
r2
);
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
real
r
=
RECIP
(
invR
);
#endif
LOAD_ATOM2_PARAMETERS
#else // !USE_SYMMETRIC
atom2
=
y
*
TILE_SIZE
+
tj
;
force
.
x
-=
dEdR1
.
x
;
#ifdef USE_SYMMETRIC
force
.
y
-=
dEdR1
.
y
;
real
dEdR
=
0.0
f
;
force
.
z
-=
dEdR1
.
z
;
#else
#ifdef ENABLE_SHUFFLE
real3
dEdR1
=
make_real3
(
0
);
shflForce
.
x
+=
dEdR2
.
x
;
real3
dEdR2
=
make_real3
(
0
);
shflForce
.
y
+=
dEdR2
.
y
;
#endif
shflForce
.
z
+=
dEdR2
.
z
;
#ifdef USE_EXCLUSIONS
#elif
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
#endif
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
real
tempEnergy
=
0.0
f
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
COMPUTE_INTERACTION
#endif
energy
+=
tempEnergy
;
#endif // end USE_SYMMETRIC
#ifdef USE_SYMMETRIC
#ifdef USE_CUTOFF
delta
*=
dEdR
;
}
force
.
x
-=
delta
.
x
;
#endif
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
tempForces
.
x
+=
delta
.
x
;
tempForces
.
y
+=
delta
.
y
;
tempForces
.
z
+=
delta
.
z
;
#else
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
tempForces
.
x
+=
dEdR2
.
x
;
tempForces
.
y
+=
dEdR2
.
y
;
tempForces
.
z
+=
dEdR2
.
z
;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
#endif
// cycles the indices
// cycles the indices
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
SHUFFLE_WARP_DATA
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
// write results for off diagonal tiles
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
x
*
0x100000000
)));
#ifdef ENABLE_SHUFFLE
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
z
*
0x100000000
)));
#elif
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
#endif
}
}
// Write results for on and off diagonal tiles
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
//if (x != y) {
// offset = y*TILE_SIZE + tgx;
// atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (tempForces.x*0x100000000)));
// atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.y*0x100000000)));
// atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.z*0x100000000)));
//}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
// of them (no cutoff).
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
...
@@ -267,6 +310,8 @@ extern "C" __global__ void computeNonbonded(
...
@@ -267,6 +310,8 @@ extern "C" __global__ void computeNonbonded(
#endif
#endif
int
skipBase
=
0
;
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
int
currentSkipIndex
=
tbx
;
// atomIndices can probably be shuffled as well
// but it probably wouldn't make things any faster
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
volatile
int
skipTiles
[
THREAD_BLOCK_SIZE
];
__shared__
volatile
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
skipTiles
[
threadIdx
.
x
]
=
-
1
;
...
@@ -277,7 +322,6 @@ extern "C" __global__ void computeNonbonded(
...
@@ -277,7 +322,6 @@ extern "C" __global__ void computeNonbonded(
bool
includeTile
=
true
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
...
@@ -317,99 +361,118 @@ extern "C" __global__ void computeNonbonded(
...
@@ -317,99 +361,118 @@ extern "C" __global__ void computeNonbonded(
}
}
if
(
includeTile
)
{
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
//
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
atomIndices
[
threadIdx
.
x
]
=
j
;
real4
tempPosq
;
#ifdef ENABLE_SHUFFLE
real3
tempForces
;
tempForces
.
x
=
0.0
f
;
tempForces
.
y
=
0.0
f
;
tempForces
.
z
=
0.0
f
;
DECLARE_LOCAL_PARAMETERS
DECLARE_LOCAL_PARAMETERS
real4
shflPosq
;
real3
shflForce
;
shflForce
.
x
=
0.0
f
;
shflForce
.
y
=
0.0
f
;
shflForce
.
z
=
0.0
f
;
#endif
if
(
j
<
PADDED_NUM_ATOMS
)
{
if
(
j
<
PADDED_NUM_ATOMS
)
{
// Load position of atom j from from global memory
// Load position of atom j from from global memory
tempPosq
=
posq
[
j
];
#ifdef ENABLE_SHUFFLE
shflPosq
=
posq
[
j
];
//localData[localAtomIndex].x = tempPosq.x;
#elif
//localData[localAtomIndex].y = tempPosq.y;
localData
[
threadIdx
.
x
].
x
=
posq
[
j
].
x
;
//localData[localAtomIndex].z = tempPosq.z;
localData
[
threadIdx
.
x
].
y
=
posq
[
j
].
y
;
//localData[localAtomIndex].q = tempPosq.w;
localData
[
threadIdx
.
x
].
z
=
posq
[
j
].
z
;
localData
[
threadIdx
.
x
].
q
=
posq
[
j
].
w
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
//localData[localAtomIndex].fx = 0.0f;
//localData[localAtomIndex].fy = 0.0f;
//localData[localAtomIndex].fz = 0.0f;
}
}
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#ifdef ENABLE_SHUFFLE
//localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
shflPosq
.
x
-=
floor
((
shflPosq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
//localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
shflPosq
.
y
-=
floor
((
shflPosq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
//localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
shflPosq
.
z
-=
floor
((
shflPosq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
tempPosq
.
x
-=
floor
((
tempPosq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
#elif
tempPosq
.
y
-=
floor
((
tempPosq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
tempPosq
.
z
-=
floor
((
tempPosq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
tempPosq
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
#elif
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
atom2
=
atomIndices
[
tbx
+
tj
];
#ifdef USE_SYMMETRIC
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
real
dEdR
=
0.0
f
;
#else
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
);
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
);
#endif
#endif
real
tempEnergy
=
0.0
f
;
real
tempEnergy
=
0.0
f
;
COMPUTE_INTERACTION
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
energy
+=
tempEnergy
;
#ifdef USE_SYMMETRIC
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
tempForces
.
x
+=
delta
.
x
;
#ifdef ENABLE_SHUFFLE
tempForces
.
y
+=
delta
.
y
;
shflForce
.
x
+=
delta
.
x
;
tempForces
.
z
+=
delta
.
z
;
shflForce
.
y
+=
delta
.
y
;
#else
shflForce
.
z
+=
delta
.
z
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
#elif
force
.
z
-=
dEdR1
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
tempForces
.
x
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
tempForces
.
y
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
tempForces
.
z
+=
dEdR2
.
z
;
#endif
#endif
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
#elif
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
}
}
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
else
else
#endif
#endif
...
@@ -418,7 +481,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -418,7 +481,11 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
tempPosq
;
#ifdef ENABLE_SHUFFLE
real4
posq2
=
shflPosq
;
#elif
real4
posq2
=
make_real4
(
localData
[
atom2
].
x
,
localData
[
atom2
].
y
,
localData
[
atom2
].
z
,
localData
[
atom2
].
q
);
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -426,52 +493,65 @@ extern "C" __global__ void computeNonbonded(
...
@@ -426,52 +493,65 @@ extern "C" __global__ void computeNonbonded(
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
);
#endif
real
tempEnergy
=
0.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
delta
.
x
;
shflForce
.
y
+=
delta
.
y
;
shflForce
.
z
+=
delta
.
z
;
#ifdef USE_CUTOFF
#elif
if
(
r2
<
CUTOFF_SQUARED
)
{
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
#endif
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
real
invR
=
RSQRT
(
r2
);
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
);
#endif
real
tempEnergy
=
0.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
tempForces
.
x
+=
delta
.
x
;
tempForces
.
y
+=
delta
.
y
;
tempForces
.
z
+=
delta
.
z
;
#else
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
tempForces
.
x
+=
dEdR2
.
x
;
tempForces
.
y
+=
dEdR2
.
y
;
tempForces
.
z
+=
dEdR2
.
z
;
#endif
#ifdef USE_CUTOFF
}
#endif
#endif
#else // !USE_SYMMETRIC
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
#ifdef ENABLE_SHUFFLE
shflForce
.
x
+=
dEdR2
.
x
;
shflForce
.
y
+=
dEdR2
.
y
;
shflForce
.
z
+=
dEdR2
.
z
;
#elif
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
// Write results.
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
...
@@ -481,13 +561,18 @@ extern "C" __global__ void computeNonbonded(
...
@@ -481,13 +561,18 @@ extern "C" __global__ void computeNonbonded(
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
x
*
0x100000000
)));
#ifdef ENABLE_SHUFFLE
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
tempForces
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
shflForce
.
z
*
0x100000000
)));
#elif
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
#endif
}
}
}
}
pos
++
;
pos
++
;
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
}
\ No newline at end of file
platforms/cuda/tests/TestCudaNonbondedForce.cpp
View file @
ec39f6ff
...
@@ -872,21 +872,21 @@ int main(int argc, char* argv[]) {
...
@@ -872,21 +872,21 @@ int main(int argc, char* argv[]) {
try
{
try
{
if
(
argc
>
1
)
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"CudaPrecision"
,
string
(
argv
[
1
]));
platform
.
setPropertyDefaultValue
(
"CudaPrecision"
,
string
(
argv
[
1
]));
//
testCoulomb();
testCoulomb
();
//
testLJ();
testLJ
();
//
testExclusionsAnd14();
testExclusionsAnd14
();
//
testCutoff();
testCutoff
();
//
testCutoff14();
testCutoff14
();
//
testPeriodic();
testPeriodic
();
testLargeSystem
();
testLargeSystem
();
//testBlockInteractions(false);
//testBlockInteractions(false);
//testBlockInteractions(true);
//testBlockInteractions(true);
//
testDispersionCorrection();
testDispersionCorrection
();
//
testChangingParameters();
testChangingParameters
();
//
testParallelComputation(false);
testParallelComputation
(
false
);
//
testParallelComputation(true);
testParallelComputation
(
true
);
//
testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
testSwitchingFunction
(
NonbondedForce
::
CutoffNonPeriodic
);
//
testSwitchingFunction(NonbondedForce::PME);
testSwitchingFunction
(
NonbondedForce
::
PME
);
}
}
catch
(
const
exception
&
e
)
{
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment