Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2bb875b0
"wrappers/python/src/vscode:/vscode.git/clone" did not exist on "5e8265c7e39089727e0826607c4c39e84f9ae94b"
Commit
2bb875b0
authored
Jul 06, 2012
by
Peter Eastman
Browse files
Optimizations to new CUDA platform
parent
ca7fd533
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
104 additions
and
42 deletions
+104
-42
platforms/cuda2/src/CudaKernels.cpp
platforms/cuda2/src/CudaKernels.cpp
+4
-3
platforms/cuda2/src/CudaNonbondedUtilities.cpp
platforms/cuda2/src/CudaNonbondedUtilities.cpp
+2
-0
platforms/cuda2/src/kernels/gbsaObc1.cu
platforms/cuda2/src/kernels/gbsaObc1.cu
+36
-9
platforms/cuda2/src/kernels/nonbonded.cu
platforms/cuda2/src/kernels/nonbonded.cu
+35
-6
platforms/cuda2/src/kernels/pme.cu
platforms/cuda2/src/kernels/pme.cu
+27
-24
No files found.
platforms/cuda2/src/CudaKernels.cpp
View file @
2bb875b0
...
@@ -1448,6 +1448,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1448,6 +1448,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
// Create required data structures.
// Create required data structures.
...
@@ -1606,9 +1607,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1606,9 +1607,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
interpolateForceThreads
=
64
;
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
());
int
interpolateSharedSize
=
2
*
interpolateForceThreads
*
PmeOrder
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double3
)
:
sizeof
(
float3
));
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
(),
interpolateForceThreads
,
interpolateSharedSize
);
}
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
...
@@ -1970,6 +1969,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -1970,6 +1969,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines
[
"USE_CUTOFF"
]
=
"1"
;
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
nb
.
getUsePeriodic
())
if
(
nb
.
getUsePeriodic
())
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
cu
.
getComputeCapability
()
>=
3.0
&&
!
cu
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"PREFACTOR"
]
=
cu
.
doubleToString
(
prefactor
);
defines
[
"PREFACTOR"
]
=
cu
.
doubleToString
(
prefactor
);
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
...
...
platforms/cuda2/src/CudaNonbondedUtilities.cpp
View file @
2bb875b0
...
@@ -447,6 +447,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -447,6 +447,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
...
...
platforms/cuda2/src/kernels/gbsaObc1.cu
View file @
2bb875b0
...
@@ -87,10 +87,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -87,10 +87,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
#endif
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
...
@@ -204,7 +205,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -204,7 +205,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
tempBuffer
[
threadIdx
.
x
]
=
0
.0
f
;
real
sum
=
0
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
#else
...
@@ -236,16 +237,24 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -236,16 +237,24 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
tempBuffer
[
threadIdx
.
x
]
=
term
;
sum
=
term
;
}
}
}
}
// Sum the forces on atom j.
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
sum
+=
__shfl_xor
(
sum
,
i
,
32
);
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
sum
;
#else
tempBuffer
[
threadIdx
.
x
]
=
sum
;
if
(
tgx
%
4
==
0
)
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
#endif
}
}
}
}
}
}
...
@@ -351,9 +360,11 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -351,9 +360,11 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real
energy
=
0
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
...
@@ -466,7 +477,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -466,7 +477,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real
3
delta
=
make_real
3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
4
delta
=
make_real
4
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
,
0
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
...
@@ -503,15 +514,30 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -503,15 +514,30 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
tempBuffer
[
threadIdx
.
x
]
=
make_real4
(
delta
.
x
,
delta
.
y
,
delta
.
z
,
dGpol_dalpha2_ij
*
bornRadius1
)
;
delta
.
w
=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
else
else
tempBuffer
[
threadIdx
.
x
]
=
make_real4
(
0
);
delta
=
make_real4
(
0
);
#endif
#endif
// Sum the forces on atom j.
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
delta
.
z
+=
__shfl_xor
(
delta
.
z
,
i
,
32
);
delta
.
w
+=
__shfl_xor
(
delta
.
w
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
delta
.
x
;
localData
[
tbx
+
j
].
fy
+=
delta
.
y
;
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
localData
[
tbx
+
j
].
fw
+=
delta
.
w
;
}
#else
tempBuffer
[
threadIdx
.
x
]
=
delta
;
if
(
tgx
%
4
==
0
)
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
{
if
(
tgx
==
0
)
{
...
@@ -521,6 +547,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -521,6 +547,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
}
}
#endif
}
}
}
}
}
}
...
...
platforms/cuda2/src/kernels/nonbonded.cu
View file @
2bb875b0
...
@@ -35,9 +35,11 @@ extern "C" __global__ void computeNonbonded(
...
@@ -35,9 +35,11 @@ extern "C" __global__ void computeNonbonded(
#endif
#endif
real
energy
=
0.0
f
;
real
energy
=
0.0
f
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
#endif
do
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
...
@@ -186,18 +188,46 @@ extern "C" __global__ void computeNonbonded(
...
@@ -186,18 +188,46 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
#ifdef USE_SYMMETRIC
#ifdef ENABLE_SHUFFLE
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
-=
delta
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
delta
.
z
+=
__shfl_xor
(
delta
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
delta
.
x
;
localData
[
tbx
+
j
].
fy
+=
delta
.
y
;
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
}
#else
force
-=
dEdR1
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
dEdR2
.
x
+=
__shfl_xor
(
dEdR2
.
x
,
i
,
32
);
dEdR2
.
y
+=
__shfl_xor
(
dEdR2
.
y
,
i
,
32
);
dEdR2
.
z
+=
__shfl_xor
(
dEdR2
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
j
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
j
].
fz
+=
dEdR2
.
z
;
}
#endif
#else
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
delta
*=
dEdR
;
force
-=
delta
;
force
-=
delta
;
tempBuffer
[
bufferIndex
]
=
delta
.
x
;
tempBuffer
[
bufferIndex
]
=
delta
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
delta
.
y
;
tempBuffer
[
bufferIndex
+
1
]
=
delta
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
delta
.
z
;
tempBuffer
[
bufferIndex
+
2
]
=
delta
.
z
;
#else
#else
force
-=
dEdR1
;
force
-=
dEdR1
;
tempBuffer
[
bufferIndex
]
=
dEdR2
.
x
;
tempBuffer
[
bufferIndex
]
=
dEdR2
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
dEdR2
.
y
;
tempBuffer
[
bufferIndex
+
1
]
=
dEdR2
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
dEdR2
.
z
;
tempBuffer
[
bufferIndex
+
2
]
=
dEdR2
.
z
;
#endif
#endif
// Sum the forces on atom2.
// Sum the forces on atom2.
...
@@ -211,6 +241,7 @@ extern "C" __global__ void computeNonbonded(
...
@@ -211,6 +241,7 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
j
].
fy
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
tbx
+
j
].
fy
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
tbx
+
j
].
fz
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
localData
[
tbx
+
j
].
fz
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
}
#endif
}
}
}
}
}
}
...
@@ -285,14 +316,12 @@ extern "C" __global__ void computeNonbonded(
...
@@ -285,14 +316,12 @@ extern "C" __global__ void computeNonbonded(
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0xFFFFFFFF
)));
__threadfence_block
();
}
}
if
(
pos
<
end
&&
x
!=
y
)
{
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0xFFFFFFFF
)));
__threadfence_block
();
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
while
(
pos
<
end
);
...
...
platforms/cuda2/src/kernels/pme.cu
View file @
2bb875b0
...
@@ -77,7 +77,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
...
@@ -77,7 +77,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
for
(
int
baseIndex
=
blockIdx
.
x
*
BUFFER_SIZE
;
baseIndex
<
NUM_ATOMS
;
baseIndex
+=
gridDim
.
x
*
BUFFER_SIZE
)
{
for
(
int
baseIndex
=
blockIdx
.
x
*
BUFFER_SIZE
;
baseIndex
<
NUM_ATOMS
;
baseIndex
+=
gridDim
.
x
*
BUFFER_SIZE
)
{
// Load the next block of atoms into the buffers.
// Load the next block of atoms into the buffers.
if
(
threadIdx
.
x
<
BUFFER_SIZE
)
{
int
atomIndex
=
baseIndex
+
threadIdx
.
x
;
int
atomIndex
=
baseIndex
+
threadIdx
.
x
;
if
(
atomIndex
<
NUM_ATOMS
)
{
if
(
atomIndex
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
atomIndex
];
real4
pos
=
posq
[
atomIndex
];
...
@@ -89,7 +88,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
...
@@ -89,7 +88,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
basey
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
);
basey
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
);
basez
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
basez
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
}
}
}
__syncthreads
();
__syncthreads
();
int
lastIndex
=
min
(
BUFFER_SIZE
,
NUM_ATOMS
-
baseIndex
);
int
lastIndex
=
min
(
BUFFER_SIZE
,
NUM_ATOMS
-
baseIndex
);
for
(
int
index
=
0
;
index
<
lastIndex
;
index
++
)
{
for
(
int
index
=
0
;
index
<
lastIndex
;
index
++
)
{
...
@@ -162,12 +160,11 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, re
...
@@ -162,12 +160,11 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, re
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real2
*
__restrict__
pmeGrid
,
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real2
*
__restrict__
pmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
extern
__shared__
real3
bsplinesCache
[];
real3
data
[
PME_ORDER
];
real3
*
data
=
&
bsplinesCache
[
threadIdx
.
x
*
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
real3
*
ddata
=
&
bsplinesCache
[
threadIdx
.
x
*
PME_ORDER
+
blockDim
.
x
*
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
real
4
force
=
make_real
4
(
0
);
real
3
force
=
make_real
3
(
0
);
real4
pos
=
posq
[
atom
];
real4
pos
=
posq
[
atom
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
...
@@ -204,19 +201,25 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
...
@@ -204,19 +201,25 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
// Compute the force on this atom.
// Compute the force on this atom.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xindex
=
gridIndex
.
x
+
ix
;
int
xbase
=
gridIndex
.
x
+
ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
real
ddx
=
ddata
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
yindex
=
gridIndex
.
y
+
iy
;
int
ybase
=
gridIndex
.
y
+
iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
real
ddy
=
ddata
[
iy
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndex
.
z
+
iz
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
xindex
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
yindex
*
GRID_SIZE_Z
+
zindex
;
int
index
=
ybase
+
zindex
;
real
gridvalue
=
pmeGrid
[
index
].
x
;
real
gridvalue
=
pmeGrid
[
index
].
x
;
force
.
x
+=
dd
ata
[
ix
].
x
*
data
[
iy
].
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
x
+=
dd
x
*
d
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
y
+=
d
ata
[
ix
].
x
*
ddata
[
iy
].
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
y
+=
d
x
*
dd
y
*
data
[
iz
].
z
*
gridvalue
;
force
.
z
+=
d
ata
[
ix
].
x
*
data
[
iy
].
y
*
ddata
[
iz
].
z
*
gridvalue
;
force
.
z
+=
d
x
*
d
y
*
ddata
[
iz
].
z
*
gridvalue
;
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment