Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2bb875b0
Commit
2bb875b0
authored
Jul 06, 2012
by
Peter Eastman
Browse files
Optimizations to new CUDA platform
parent
ca7fd533
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
104 additions
and
42 deletions
+104
-42
platforms/cuda2/src/CudaKernels.cpp
platforms/cuda2/src/CudaKernels.cpp
+4
-3
platforms/cuda2/src/CudaNonbondedUtilities.cpp
platforms/cuda2/src/CudaNonbondedUtilities.cpp
+2
-0
platforms/cuda2/src/kernels/gbsaObc1.cu
platforms/cuda2/src/kernels/gbsaObc1.cu
+36
-9
platforms/cuda2/src/kernels/nonbonded.cu
platforms/cuda2/src/kernels/nonbonded.cu
+35
-6
platforms/cuda2/src/kernels/pme.cu
platforms/cuda2/src/kernels/pme.cu
+27
-24
No files found.
platforms/cuda2/src/CudaKernels.cpp
View file @
2bb875b0
...
@@ -1448,6 +1448,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1448,6 +1448,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
// Create required data structures.
// Create required data structures.
...
@@ -1606,9 +1607,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1606,9 +1607,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
interpolateForceThreads
=
64
;
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
());
int
interpolateSharedSize
=
2
*
interpolateForceThreads
*
PmeOrder
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double3
)
:
sizeof
(
float3
));
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
(),
interpolateForceThreads
,
interpolateSharedSize
);
}
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
...
@@ -1970,6 +1969,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -1970,6 +1969,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines
[
"USE_CUTOFF"
]
=
"1"
;
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
nb
.
getUsePeriodic
())
if
(
nb
.
getUsePeriodic
())
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
cu
.
getComputeCapability
()
>=
3.0
&&
!
cu
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"PREFACTOR"
]
=
cu
.
doubleToString
(
prefactor
);
defines
[
"PREFACTOR"
]
=
cu
.
doubleToString
(
prefactor
);
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
...
...
platforms/cuda2/src/CudaNonbondedUtilities.cpp
View file @
2bb875b0
...
@@ -447,6 +447,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -447,6 +447,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
...
...
platforms/cuda2/src/kernels/gbsaObc1.cu
View file @
2bb875b0
...
@@ -87,10 +87,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -87,10 +87,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
#endif
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
...
@@ -204,7 +205,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -204,7 +205,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
tempBuffer
[
threadIdx
.
x
]
=
0
.0
f
;
real
sum
=
0
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
#else
...
@@ -236,16 +237,24 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -236,16 +237,24 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
tempBuffer
[
threadIdx
.
x
]
=
term
;
sum
=
term
;
}
}
}
}
// Sum the forces on atom j.
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
sum
+=
__shfl_xor
(
sum
,
i
,
32
);
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
sum
;
#else
tempBuffer
[
threadIdx
.
x
]
=
sum
;
if
(
tgx
%
4
==
0
)
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
#endif
}
}
}
}
}
}
...
@@ -351,10 +360,12 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -351,10 +360,12 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real
energy
=
0
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
...
@@ -466,7 +477,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -466,7 +477,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real
3
delta
=
make_real
3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
4
delta
=
make_real
4
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
,
0
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
...
@@ -503,15 +514,30 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -503,15 +514,30 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
tempBuffer
[
threadIdx
.
x
]
=
make_real4
(
delta
.
x
,
delta
.
y
,
delta
.
z
,
dGpol_dalpha2_ij
*
bornRadius1
)
;
delta
.
w
=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
else
else
tempBuffer
[
threadIdx
.
x
]
=
make_real4
(
0
);
delta
=
make_real4
(
0
);
#endif
#endif
// Sum the forces on atom j.
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
delta
.
z
+=
__shfl_xor
(
delta
.
z
,
i
,
32
);
delta
.
w
+=
__shfl_xor
(
delta
.
w
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
delta
.
x
;
localData
[
tbx
+
j
].
fy
+=
delta
.
y
;
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
localData
[
tbx
+
j
].
fw
+=
delta
.
w
;
}
#else
tempBuffer
[
threadIdx
.
x
]
=
delta
;
if
(
tgx
%
4
==
0
)
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
{
if
(
tgx
==
0
)
{
...
@@ -521,6 +547,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -521,6 +547,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
}
}
#endif
}
}
}
}
}
}
...
...
platforms/cuda2/src/kernels/nonbonded.cu
View file @
2bb875b0
...
@@ -35,9 +35,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -35,9 +35,11 @@ extern "C" __global__ void computeNonbonded(
#endif
#endif
real
energy
=
0.0
f
;
real
energy
=
0.0
f
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
#endif
do
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
...
@@ -186,18 +188,46 @@ extern "C" __global__ void computeNonbonded(
...
@@ -186,18 +188,46 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
#ifdef USE_SYMMETRIC
#ifdef ENABLE_SHUFFLE
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
-=
delta
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
delta
.
z
+=
__shfl_xor
(
delta
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
delta
.
x
;
localData
[
tbx
+
j
].
fy
+=
delta
.
y
;
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
}
#else
force
-=
dEdR1
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
dEdR2
.
x
+=
__shfl_xor
(
dEdR2
.
x
,
i
,
32
);
dEdR2
.
y
+=
__shfl_xor
(
dEdR2
.
y
,
i
,
32
);
dEdR2
.
z
+=
__shfl_xor
(
dEdR2
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
j
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
j
].
fz
+=
dEdR2
.
z
;
}
#endif
#else
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
delta
*=
dEdR
;
force
-=
delta
;
force
-=
delta
;
tempBuffer
[
bufferIndex
]
=
delta
.
x
;
tempBuffer
[
bufferIndex
]
=
delta
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
delta
.
y
;
tempBuffer
[
bufferIndex
+
1
]
=
delta
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
delta
.
z
;
tempBuffer
[
bufferIndex
+
2
]
=
delta
.
z
;
#else
#else
force
-=
dEdR1
;
force
-=
dEdR1
;
tempBuffer
[
bufferIndex
]
=
dEdR2
.
x
;
tempBuffer
[
bufferIndex
]
=
dEdR2
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
dEdR2
.
y
;
tempBuffer
[
bufferIndex
+
1
]
=
dEdR2
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
dEdR2
.
z
;
tempBuffer
[
bufferIndex
+
2
]
=
dEdR2
.
z
;
#endif
#endif
// Sum the forces on atom2.
// Sum the forces on atom2.
...
@@ -211,6 +241,7 @@ extern "C" __global__ void computeNonbonded(
...
@@ -211,6 +241,7 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
j
].
fy
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
tbx
+
j
].
fy
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
tbx
+
j
].
fz
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
localData
[
tbx
+
j
].
fz
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
}
#endif
}
}
}
}
}
}
...
@@ -285,14 +316,12 @@ extern "C" __global__ void computeNonbonded(
...
@@ -285,14 +316,12 @@ extern "C" __global__ void computeNonbonded(
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0xFFFFFFFF
)));
__threadfence_block
();
}
}
if
(
pos
<
end
&&
x
!=
y
)
{
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0xFFFFFFFF
)));
__threadfence_block
();
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
while
(
pos
<
end
);
...
...
platforms/cuda2/src/kernels/pme.cu
View file @
2bb875b0
...
@@ -77,18 +77,16 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
...
@@ -77,18 +77,16 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
for
(
int
baseIndex
=
blockIdx
.
x
*
BUFFER_SIZE
;
baseIndex
<
NUM_ATOMS
;
baseIndex
+=
gridDim
.
x
*
BUFFER_SIZE
)
{
for
(
int
baseIndex
=
blockIdx
.
x
*
BUFFER_SIZE
;
baseIndex
<
NUM_ATOMS
;
baseIndex
+=
gridDim
.
x
*
BUFFER_SIZE
)
{
// Load the next block of atoms into the buffers.
// Load the next block of atoms into the buffers.
if
(
threadIdx
.
x
<
BUFFER_SIZE
)
{
int
atomIndex
=
baseIndex
+
threadIdx
.
x
;
int
atomIndex
=
baseIndex
+
threadIdx
.
x
;
if
(
atomIndex
<
NUM_ATOMS
)
{
if
(
atomIndex
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
atomIndex
];
real4
pos
=
posq
[
atomIndex
];
charge
[
threadIdx
.
x
]
=
pos
.
w
;
charge
[
threadIdx
.
x
]
=
pos
.
w
;
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
basex
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
GRID_SIZE_X
);
basex
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
GRID_SIZE_X
);
basey
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
);
basey
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
);
basez
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
basez
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
}
}
}
__syncthreads
();
__syncthreads
();
int
lastIndex
=
min
(
BUFFER_SIZE
,
NUM_ATOMS
-
baseIndex
);
int
lastIndex
=
min
(
BUFFER_SIZE
,
NUM_ATOMS
-
baseIndex
);
...
@@ -162,12 +160,11 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, re
...
@@ -162,12 +160,11 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, re
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real2
*
__restrict__
pmeGrid
,
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real2
*
__restrict__
pmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
extern
__shared__
real3
bsplinesCache
[];
real3
data
[
PME_ORDER
];
real3
*
data
=
&
bsplinesCache
[
threadIdx
.
x
*
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
real3
*
ddata
=
&
bsplinesCache
[
threadIdx
.
x
*
PME_ORDER
+
blockDim
.
x
*
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
real
4
force
=
make_real
4
(
0
);
real
3
force
=
make_real
3
(
0
);
real4
pos
=
posq
[
atom
];
real4
pos
=
posq
[
atom
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
...
@@ -204,19 +201,25 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
...
@@ -204,19 +201,25 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
// Compute the force on this atom.
// Compute the force on this atom.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xindex
=
gridIndex
.
x
+
ix
;
int
xbase
=
gridIndex
.
x
+
ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
real
ddx
=
ddata
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
yindex
=
gridIndex
.
y
+
iy
;
int
ybase
=
gridIndex
.
y
+
iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
real
ddy
=
ddata
[
iy
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndex
.
z
+
iz
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
xindex
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
yindex
*
GRID_SIZE_Z
+
zindex
;
int
index
=
ybase
+
zindex
;
real
gridvalue
=
pmeGrid
[
index
].
x
;
real
gridvalue
=
pmeGrid
[
index
].
x
;
force
.
x
+=
dd
ata
[
ix
].
x
*
data
[
iy
].
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
x
+=
dd
x
*
d
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
y
+=
d
ata
[
ix
].
x
*
ddata
[
iy
].
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
y
+=
d
x
*
dd
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
z
+=
d
ata
[
ix
].
x
*
data
[
iy
].
y
*
ddata
[
iz
].
z
*
gridvalue
;
force
.
z
+=
d
x
*
d
y
*
ddata
[
iz
].
z
*
gridvalue
;
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment