Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
6768d049
Commit
6768d049
authored
Apr 10, 2010
by
Peter Eastman
Browse files
Optimizations to the CUDA implementation of CCMA
parent
c961d453
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
28 additions
and
51 deletions
+28
-51
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+1
-1
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+6
-7
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+2
-1
platforms/cuda/src/kernels/kCCMA.cu
platforms/cuda/src/kernels/kCCMA.cu
+19
-42
No files found.
platforms/cuda/src/kernels/cudatypes.h
View file @
6768d049
...
...
@@ -460,7 +460,7 @@ struct cudaGmxSimulation {
float
*
pCcmaDelta2
;
// Workspace for CCMA
int
*
pCcmaAtomConstraints
;
// The indices of constraints involving each atom
int
*
pCcmaNumAtomConstraints
;
// The number of constraints involving each atom
int
*
pC
cmaConverged
;
// Used by CCMA to communicate whether iteration has converged
int
*
c
cmaConverged
DeviceMarker
;
// Device memory used to communicate that CCMA has converged
float
*
pCcmaReducedMass
;
// The reduced mass for each CCMA constraint
unsigned
int
*
pConstraintMatrixColumn
;
// The column of each element in the constraint matrix.
float
*
pConstraintMatrixValue
;
// The value of each element in the constraint matrix.
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
6768d049
...
...
@@ -1488,7 +1488,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
result
=
QUERN_solve_with_r
(
numCCMA
,
rRowStart
,
rColIndex
,
rValue
,
&
rhs
[
0
],
&
rhs
[
0
]);
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
{
double
value
=
rhs
[
j
]
*
distance
[
ccmaConstraints
[
i
]]
/
distance
[
ccmaConstraints
[
j
]];
if
(
abs
(
value
)
>
0.
1
)
if
(
abs
(
value
)
>
0.
05
)
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
i
,
value
));
}
}
...
...
@@ -1533,9 +1533,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream
<
float
>*
psCcmaDelta2
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaDelta2"
);
gpu
->
psCcmaDelta2
=
psCcmaDelta2
;
gpu
->
sim
.
pCcmaDelta2
=
psCcmaDelta2
->
_pDevData
;
CUDAStream
<
int
>*
psCcmaConverged
=
new
CUDAStream
<
int
>
(
gpu
->
sim
.
blocks
,
1
,
"CcmaConverged"
);
gpu
->
psCcmaConverged
=
psCcmaConverged
;
gpu
->
sim
.
pCcmaConverged
=
psCcmaConverged
->
_pDevData
;
CUDAStream
<
float
>*
psCcmaReducedMass
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaReducedMass"
);
gpu
->
psCcmaReducedMass
=
psCcmaReducedMass
;
gpu
->
sim
.
pCcmaReducedMass
=
psCcmaReducedMass
->
_pDevData
;
...
...
@@ -1545,6 +1542,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream
<
float
>*
psConstraintMatrixValue
=
new
CUDAStream
<
float
>
(
numCCMA
*
maxRowElements
,
1
,
"ConstraintMatrixValue"
);
gpu
->
psConstraintMatrixValue
=
psConstraintMatrixValue
;
gpu
->
sim
.
pConstraintMatrixValue
=
psConstraintMatrixValue
->
_pDevData
;
cudaHostAlloc
((
void
**
)
&
gpu
->
ccmaConvergedHostMarker
,
sizeof
(
int
),
cudaHostAllocMapped
);
cudaHostGetDevicePointer
((
void
**
)
&
gpu
->
sim
.
ccmaConvergedDeviceMarker
,
(
void
*
)
gpu
->
ccmaConvergedHostMarker
,
0
);
cudaEventCreate
(
&
gpu
->
ccmaEvent
);
gpu
->
sim
.
ccmaConstraints
=
numCCMA
;
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
int
index
=
constraintOrder
[
i
];
...
...
@@ -1802,7 +1802,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
cudaSetDevice
(
device
);
// Ignore errors
status
=
cudaGetDevice
(
&
gpu
->
device
);
RTERROR
(
status
,
"Error getting CUDA device"
)
status
=
cudaSetDeviceFlags
(
useBlockingSync
?
cudaDeviceBlockingSync
:
cudaDeviceScheduleAuto
);
status
=
cudaSetDeviceFlags
(
cudaDeviceMapHost
+
(
useBlockingSync
?
cudaDeviceBlockingSync
:
cudaDeviceScheduleAuto
)
)
;
RTERROR
(
status
,
"Error setting device flags"
)
gpu
->
useBlockingSync
=
useBlockingSync
;
...
...
@@ -1988,7 +1988,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu
->
psCcmaNumAtomConstraints
=
NULL
;
gpu
->
psCcmaDelta1
=
NULL
;
gpu
->
psCcmaDelta2
=
NULL
;
gpu
->
psCcmaConverged
=
NULL
;
gpu
->
psCcmaReducedMass
=
NULL
;
gpu
->
psConstraintMatrixColumn
=
NULL
;
gpu
->
psConstraintMatrixValue
=
NULL
;
...
...
@@ -2207,8 +2206,8 @@ void gpuShutDown(gpuContext gpu)
delete
gpu
->
psCcmaNumAtomConstraints
;
delete
gpu
->
psCcmaDelta1
;
delete
gpu
->
psCcmaDelta2
;
delete
gpu
->
psCcmaConverged
;
delete
gpu
->
psCcmaReducedMass
;
cudaEventDestroy
(
gpu
->
ccmaEvent
);
delete
gpu
->
psConstraintMatrixColumn
;
delete
gpu
->
psConstraintMatrixValue
;
delete
gpu
->
psTabulatedFunctionParams
;
...
...
platforms/cuda/src/kernels/gputypes.h
View file @
6768d049
...
...
@@ -171,7 +171,8 @@ struct _gpuContext {
CUDAStream
<
int
>*
psCcmaNumAtomConstraints
;
// The number of constraints involving each atom
CUDAStream
<
float
>*
psCcmaDelta1
;
// Workspace for CCMA
CUDAStream
<
float
>*
psCcmaDelta2
;
// Workspace for CCMA
CUDAStream
<
int
>*
psCcmaConverged
;
// Used by CCMA to communicate whether iteration has converged
int
*
ccmaConvergedHostMarker
;
// Host memory used to communicate that CCMA has converged
cudaEvent_t
ccmaEvent
;
// Used to optimize communication during CCMA
CUDAStream
<
float
>*
psCcmaReducedMass
;
// The reduced mass for each CCMA constraint
CUDAStream
<
float
>*
psRigidClusterMatrix
;
// The inverse constraint matrix for each rigid cluster
CUDAStream
<
unsigned
int
>*
psRigidClusterConstraintIndex
;
// The index of each cluster in the stream containing cluster constraints.
...
...
platforms/cuda/src/kernels/kCCMA.cu
View file @
6768d049
...
...
@@ -71,11 +71,6 @@ kComputeCCMAConstraintDirections()
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
cSim
.
pCcmaDistance
[
index
]
=
dir
;
}
// Mark that no blocks have converged yet.
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
cSim
.
pCcmaConverged
[
index
]
=
0
;
}
__global__
void
...
...
@@ -88,12 +83,12 @@ __launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintForces
(
float4
*
atomPositions
,
bool
addOldPosition
)
{
if
(
cSim
.
pCcmaConverged
[
blockIdx
.
x
])
return
;
// The constraint iteration has already converged.
extern
__shared__
int
convergedBuffer
[];
__shared__
int
converged
;
float
lowerTol
=
1.0
f
-
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
float
upperTol
=
1.0
f
+
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
int
threadConverged
=
1
;
if
(
threadIdx
.
x
==
0
)
converged
=
1
;
__syncthreads
();
// Calculate the constraint force for each constraint.
...
...
@@ -120,20 +115,12 @@ kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
// See whether it has converged.
threadConverged
&=
(
rp2
>=
lowerTol
*
dist2
&&
rp2
<=
upperTol
*
dist2
);
}
// Perform a parallel reduction to see if all constraints handled by this block have converged.
convergedBuffer
[
threadIdx
.
x
]
=
threadConverged
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
%
(
2
*
step
)
==
0
)
convergedBuffer
[
threadIdx
.
x
]
&=
convergedBuffer
[
threadIdx
.
x
+
step
];
__syncthreads
();
if
(
converged
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
{
converged
=
0
;
*
cSim
.
ccmaConvergedDeviceMarker
=
0
;
}
}
if
(
threadIdx
.
x
==
0
)
cSim
.
pCcmaConverged
[
blockIdx
.
x
]
=
convergedBuffer
[
0
];
}
__global__
void
...
...
@@ -146,22 +133,8 @@ __launch_bounds__(256, 1)
#endif
kMultiplyByCCMAConstraintMatrix
()
{
extern
__shared__
int
convergedBuffer
[];
// First see whether all work groups have converged.
convergedBuffer
[
threadIdx
.
x
]
=
true
;
for
(
int
index
=
threadIdx
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
)
convergedBuffer
[
threadIdx
.
x
]
&=
cSim
.
pCcmaConverged
[
index
];
__syncthreads
();
for
(
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
%
(
2
*
step
)
==
0
)
convergedBuffer
[
threadIdx
.
x
]
&=
convergedBuffer
[
threadIdx
.
x
+
step
];
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
cSim
.
pCcmaConverged
[
blockIdx
.
x
]
=
convergedBuffer
[
0
];
if
(
cSim
.
pCcmaConverged
[
0
])
return
;
// The constraint iteration has already converged.
if
(
*
cSim
.
ccmaConvergedDeviceMarker
)
return
;
// The constraint iteration has already converged
// Multiply by the inverse constraint matrix.
...
...
@@ -190,7 +163,7 @@ __launch_bounds__(256, 1)
#endif
kUpdateCCMAAtomPositions
(
float4
*
atomPositions
,
int
iteration
)
{
if
(
cSim
.
pC
cmaConverged
[
blockIdx
.
x
]
)
if
(
*
cSim
.
c
cmaConverged
DeviceMarker
)
return
;
// The constraint iteration has already converged.
float
damping
=
(
iteration
<
2
?
0.5
f
:
1.0
f
);
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
atoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
...
...
@@ -218,13 +191,17 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
{
kComputeCCMAConstraintDirections
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeCCMAConstraintDirections"
);
const
int
checkInterval
=
3
;
for
(
int
i
=
0
;
i
<
150
;
i
++
)
{
if
((
i
+
1
)
%
checkInterval
==
0
)
*
gpu
->
ccmaConvergedHostMarker
=
1
;
kComputeCCMAConstraintForces
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
,
gpu
->
sim
.
ccma_threads_per_block
*
sizeof
(
int
)
>>>
(
posq
,
addOldPosition
);
gpu
->
psCcmaConverged
->
Download
(
);
cudaEventRecord
(
gpu
->
ccmaEvent
,
0
);
kMultiplyByCCMAConstraintMatrix
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
,
gpu
->
sim
.
ccma_threads_per_block
*
sizeof
(
int
)
>>>
();
if
((
*
gpu
->
psCcmaConverged
)[
0
])
kUpdateCCMAAtomPositions
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
3
*
i
+
2
);
cudaEventSynchronize
(
gpu
->
ccmaEvent
);
if
((
i
+
1
)
%
checkInterval
==
0
&&
*
gpu
->
ccmaConvergedHostMarker
)
break
;
kUpdateCCMAAtomPositions
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
i
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment