Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
1d0193eb
Commit
1d0193eb
authored
Apr 09, 2010
by
Peter Eastman
Browse files
Restructured CCMA code to work on Fermi boards
parent
5301b9cc
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
154 additions
and
142 deletions
+154
-142
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+1
-2
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+5
-13
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+2
-3
platforms/cuda/src/kernels/kCCMA.cu
platforms/cuda/src/kernels/kCCMA.cu
+146
-124
No files found.
platforms/cuda/src/kernels/cudatypes.h
View file @
1d0193eb
...
...
@@ -460,8 +460,7 @@ struct cudaGmxSimulation {
float
*
pCcmaDelta2
;
// Workspace for CCMA
int
*
pCcmaAtomConstraints
;
// The indices of constraints involving each atom
int
*
pCcmaNumAtomConstraints
;
// The number of constraints involving each atom
short
*
pSyncCounter
;
// Used for global thread synchronization
unsigned
int
*
pRequiredIterations
;
// Used by CCMA to communicate whether iteration has converged
int
*
pCcmaConverged
;
// Used by CCMA to communicate whether iteration has converged
float
*
pCcmaReducedMass
;
// The reduced mass for each CCMA constraint
unsigned
int
*
pConstraintMatrixColumn
;
// The column of each element in the constraint matrix.
float
*
pConstraintMatrixValue
;
// The value of each element in the constraint matrix.
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
1d0193eb
...
...
@@ -1533,12 +1533,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream
<
float
>*
psCcmaDelta2
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaDelta2"
);
gpu
->
psCcmaDelta2
=
psCcmaDelta2
;
gpu
->
sim
.
pCcmaDelta2
=
psCcmaDelta2
->
_pDevData
;
CUDAStream
<
short
>*
psSyncCounter
=
new
CUDAStream
<
short
>
(
3
*
gpu
->
sim
.
blocks
,
1
,
"SyncCounter"
);
gpu
->
psSyncCounter
=
psSyncCounter
;
gpu
->
sim
.
pSyncCounter
=
psSyncCounter
->
_pDevData
;
CUDAStream
<
unsigned
int
>*
psRequiredIterations
=
new
CUDAStream
<
unsigned
int
>
(
1
,
1
,
"RequiredIterations"
);
gpu
->
psRequiredIterations
=
psRequiredIterations
;
gpu
->
sim
.
pRequiredIterations
=
psRequiredIterations
->
_pDevData
;
CUDAStream
<
int
>*
psCcmaConverged
=
new
CUDAStream
<
int
>
(
gpu
->
sim
.
blocks
,
1
,
"CcmaConverged"
);
gpu
->
psCcmaConverged
=
psCcmaConverged
;
gpu
->
sim
.
pCcmaConverged
=
psCcmaConverged
->
_pDevData
;
CUDAStream
<
float
>*
psCcmaReducedMass
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaReducedMass"
);
gpu
->
psCcmaReducedMass
=
psCcmaReducedMass
;
gpu
->
sim
.
pCcmaReducedMass
=
psCcmaReducedMass
->
_pDevData
;
...
...
@@ -1562,8 +1559,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
}
(
*
psConstraintMatrixColumn
)[
i
+
matrix
[
index
].
size
()
*
numCCMA
]
=
numCCMA
;
}
for
(
unsigned
int
i
=
0
;
i
<
psSyncCounter
->
_length
;
i
++
)
(
*
psSyncCounter
)[
i
]
=
-
1
;
for
(
unsigned
int
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
{
(
*
psCcmaNumAtomConstraints
)[
i
]
=
atomConstraints
[
i
].
size
();
for
(
unsigned
int
j
=
0
;
j
<
atomConstraints
[
i
].
size
();
j
++
)
{
...
...
@@ -1576,7 +1571,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
psCcmaReducedMass
->
Upload
();
psCcmaAtomConstraints
->
Upload
();
psCcmaNumAtomConstraints
->
Upload
();
psSyncCounter
->
Upload
();
psConstraintMatrixColumn
->
Upload
();
psConstraintMatrixValue
->
Upload
();
gpu
->
sim
.
ccma_threads_per_block
=
(
gpu
->
sim
.
ccmaConstraints
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
...
...
@@ -1994,8 +1988,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu
->
psCcmaNumAtomConstraints
=
NULL
;
gpu
->
psCcmaDelta1
=
NULL
;
gpu
->
psCcmaDelta2
=
NULL
;
gpu
->
psSyncCounter
=
NULL
;
gpu
->
psRequiredIterations
=
NULL
;
gpu
->
psCcmaConverged
=
NULL
;
gpu
->
psCcmaReducedMass
=
NULL
;
gpu
->
psConstraintMatrixColumn
=
NULL
;
gpu
->
psConstraintMatrixValue
=
NULL
;
...
...
@@ -2214,8 +2207,7 @@ void gpuShutDown(gpuContext gpu)
delete
gpu
->
psCcmaNumAtomConstraints
;
delete
gpu
->
psCcmaDelta1
;
delete
gpu
->
psCcmaDelta2
;
delete
gpu
->
psSyncCounter
;
delete
gpu
->
psRequiredIterations
;
delete
gpu
->
psCcmaConverged
;
delete
gpu
->
psCcmaReducedMass
;
delete
gpu
->
psConstraintMatrixColumn
;
delete
gpu
->
psConstraintMatrixValue
;
...
...
platforms/cuda/src/kernels/gputypes.h
View file @
1d0193eb
...
...
@@ -171,9 +171,8 @@ struct _gpuContext {
CUDAStream
<
int
>*
psCcmaNumAtomConstraints
;
// The number of constraints involving each atom
CUDAStream
<
float
>*
psCcmaDelta1
;
// Workspace for CCMA
CUDAStream
<
float
>*
psCcmaDelta2
;
// Workspace for CCMA
CUDAStream
<
short
>*
psSyncCounter
;
// Used for global thread synchronization
CUDAStream
<
unsigned
int
>*
psRequiredIterations
;
// Used by SHAKE to communicate whether iteration has converged
CUDAStream
<
float
>*
psCcmaReducedMass
;
// The reduced mass for each SHAKE constraint
CUDAStream
<
int
>*
psCcmaConverged
;
// Used by CCMA to communicate whether iteration has converged
CUDAStream
<
float
>*
psCcmaReducedMass
;
// The reduced mass for each CCMA constraint
CUDAStream
<
float
>*
psRigidClusterMatrix
;
// The inverse constraint matrix for each rigid cluster
CUDAStream
<
unsigned
int
>*
psRigidClusterConstraintIndex
;
// The index of each cluster in the stream containing cluster constraints.
CUDAStream
<
unsigned
int
>*
psRigidClusterMatrixIndex
;
// The index of each cluster in the stream containing cluster matrices.
...
...
platforms/cuda/src/kernels/kCCMA.cu
View file @
1d0193eb
...
...
@@ -48,24 +48,6 @@ void GetCCMASim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
/**
* Synchronize all threads across all blocks.
*/
__device__
void
kSyncAllThreads_kernel
(
short
*
syncCounter
,
short
newCount
)
{
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
syncCounter
[
blockIdx
.
x
]
=
newCount
;
if
(
threadIdx
.
x
<
gridDim
.
x
)
{
volatile
short
*
counter
=
&
syncCounter
[
threadIdx
.
x
];
do
{
}
while
(
*
counter
!=
newCount
);
}
__syncthreads
();
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
...
...
@@ -74,52 +56,53 @@ __launch_bounds__(512, 1)
#else
__launch_bounds__
(
256
,
1
)
#endif
k
ApplyCCMA_kernel
(
float4
*
atomPositions
,
bool
addOldPosition
)
k
ComputeCCMAConstraintDirections
(
)
{
// Initialize counters used for monitoring convergence and doing global thread synchronization.
__shared__
unsigned
int
requiredIterations
;
if
(
threadIdx
.
x
==
0
)
{
requiredIterations
=
0
;
cSim
.
pSyncCounter
[
gridDim
.
x
+
blockIdx
.
x
]
=
-
1
;
cSim
.
pSyncCounter
[
2
*
gridDim
.
x
+
blockIdx
.
x
]
=
-
1
;
cSim
.
pRequiredIterations
[
0
]
=
0
;
}
// Calculate the direction of each constraint.
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
pos
<
cSim
.
ccmaConstraints
)
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
pos
];
float4
dir
=
cSim
.
pCcmaDistance
[
pos
];
int2
atoms
=
cSim
.
pCcmaAtoms
[
index
];
float4
dir
=
cSim
.
pCcmaDistance
[
index
];
float4
oldPos1
=
cSim
.
pOldPosq
[
atoms
.
x
];
float4
oldPos2
=
cSim
.
pOldPosq
[
atoms
.
y
];
dir
.
x
=
oldPos1
.
x
-
oldPos2
.
x
;
dir
.
y
=
oldPos1
.
y
-
oldPos2
.
y
;
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
cSim
.
pCcmaDistance
[
pos
]
=
dir
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
cSim
.
pCcmaDistance
[
index
]
=
dir
;
}
__syncthreads
();
//
Iteratively update the atom positions
.
//
Mark that no blocks have converged yet
.
unsigned
int
maxIterations
=
150
;
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
cSim
.
pCcmaConverged
[
index
]
=
false
;
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kComputeCCMAConstraintForces
(
float4
*
atomPositions
,
bool
addOldPosition
)
{
if
(
cSim
.
pCcmaConverged
[
blockIdx
.
x
])
return
;
// The constraint iteration has already converged.
extern
__shared__
int
convergedBuffer
[];
float
lowerTol
=
1.0
f
-
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
float
upperTol
=
1.0
f
+
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
for
(
unsigned
int
iteration
=
0
;
iteration
<
maxIterations
;
iteration
++
)
{
int
threadConverged
=
1
;
// Calculate the constraint force for each constraint.
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
pos
<
cSim
.
ccmaConstraints
)
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
pos
];
int2
atoms
=
cSim
.
pCcmaAtoms
[
index
];
float4
delta1
=
atomPositions
[
atoms
.
x
];
float4
delta2
=
atomPositions
[
atoms
.
y
];
float4
dir
=
cSim
.
pCcmaDistance
[
pos
];
float4
dir
=
cSim
.
pCcmaDistance
[
index
];
float3
rp_ij
=
make_float3
(
delta1
.
x
-
delta2
.
x
,
delta1
.
y
-
delta2
.
y
,
delta1
.
z
-
delta2
.
z
);
if
(
addOldPosition
)
{
...
...
@@ -132,51 +115,92 @@ kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
float
diff
=
dist2
-
rp2
;
float
rrpr
=
rp_ij
.
x
*
dir
.
x
+
rp_ij
.
y
*
dir
.
y
+
rp_ij
.
z
*
dir
.
z
;
float
d_ij2
=
dir
.
x
*
dir
.
x
+
dir
.
y
*
dir
.
y
+
dir
.
z
*
dir
.
z
;
float
reducedMass
=
cSim
.
pCcmaReducedMass
[
pos
];
cSim
.
pCcmaDelta1
[
pos
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
*
diff
/
rrpr
:
0.0
f
);
if
(
requiredIterations
==
iteration
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
requiredIterations
=
iteration
+
1
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
float
reducedMass
=
cSim
.
pCcmaReducedMass
[
index
];
cSim
.
pCcmaDelta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
*
diff
/
rrpr
:
0.0
f
);
// See whether it has converged.
threadConverged
&=
(
rp2
>=
lowerTol
*
dist2
&&
rp2
<=
upperTol
*
dist2
);
}
// Perform a parallel reduction to see if all constraints handled by this block have converged.
convergedBuffer
[
threadIdx
.
x
]
=
threadConverged
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
%
(
2
*
step
)
==
0
)
convergedBuffer
[
threadIdx
.
x
]
&=
convergedBuffer
[
threadIdx
.
x
+
step
];
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
cSim
.
pCcmaConverged
[
blockIdx
.
x
]
=
convergedBuffer
[
0
];
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kMultiplyByCCMAConstraintMatrix
()
{
extern
__shared__
int
convergedBuffer
[];
// First see whether all work groups have converged.
convergedBuffer
[
threadIdx
.
x
]
=
true
;
for
(
int
index
=
threadIdx
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
)
convergedBuffer
[
threadIdx
.
x
]
&=
cSim
.
pCcmaConverged
[
index
];
__syncthreads
();
for
(
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
%
(
2
*
step
)
==
0
)
convergedBuffer
[
threadIdx
.
x
]
&=
convergedBuffer
[
threadIdx
.
x
+
step
];
__syncthreads
();
if
(
threadIdx
.
x
==
0
&&
requiredIterations
>
iteration
)
cSim
.
pRequi
redI
terations
[
0
]
=
requiredIterations
;
kSyncAllThreads_kernel
(
cSim
.
pSyncCounter
,
iteration
)
;
if
(
iteration
==
cSim
.
pRequiredIterations
[
0
])
break
;
//
All
constraint
s have
converged.
}
if
(
th
re
a
dI
dx
.
x
==
0
)
cSim
.
pCcmaConverged
[
blockIdx
.
x
]
=
convergedBuffer
[
0
]
;
if
(
cSim
.
pCcmaConverged
[
0
])
return
;
//
The
constraint
iteration has already
converged.
// Multiply by the inverse constraint matrix.
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
pos
<
cSim
.
ccmaConstraints
)
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float
sum
=
0.0
f
;
for
(
unsigned
int
i
=
0
;
;
i
++
)
{
unsigned
int
index
=
pos
+
i
*
cSim
.
ccmaConstraints
;
unsigned
int
column
=
cSim
.
pConstraintMatrixColumn
[
index
];
unsigned
int
element
=
index
+
i
*
cSim
.
ccmaConstraints
;
unsigned
int
column
=
cSim
.
pConstraintMatrixColumn
[
element
];
if
(
column
>=
cSim
.
ccmaConstraints
)
break
;
sum
+=
cSim
.
pCcmaDelta1
[
column
]
*
cSim
.
pConstraintMatrixValue
[
index
];
sum
+=
cSim
.
pCcmaDelta1
[
column
]
*
cSim
.
pConstraintMatrixValue
[
element
];
}
cSim
.
pCcmaDelta2
[
pos
]
=
sum
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
cSim
.
pCcmaDelta2
[
index
]
=
sum
;
}
kSyncAllThreads_kernel
(
&
cSim
.
pSyncCounter
[
gridDim
.
x
],
iteration
);
// Update the position of each atom.
}
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kUpdateCCMAAtomPositions
(
float4
*
atomPositions
,
int
iteration
)
{
if
(
cSim
.
pCcmaConverged
[
blockIdx
.
x
])
return
;
// The constraint iteration has already converged.
float
damping
=
(
iteration
<
2
?
0.5
f
:
1.0
f
);
while
(
pos
<
cSim
.
atoms
)
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
atoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
atomPos
=
atomPositions
[
pos
];
float
invMass
=
cSim
.
pVelm4
[
pos
].
w
;
int
num
=
cSim
.
pCcmaNumAtomConstraints
[
pos
];
float4
atomPos
=
atomPositions
[
index
];
float
invMass
=
cSim
.
pVelm4
[
index
].
w
;
int
num
=
cSim
.
pCcmaNumAtomConstraints
[
index
];
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
int
index
=
pos
+
i
*
cSim
.
atoms
;
int
constraint
=
cSim
.
pCcmaAtomConstraints
[
index
];
int
constraint
=
cSim
.
pCcmaAtomConstraints
[
index
+
i
*
cSim
.
atoms
];
bool
forward
=
(
constraint
>
0
);
constraint
=
(
forward
?
constraint
-
1
:
-
constraint
-
1
);
float
constraintForce
=
damping
*
invMass
*
cSim
.
pCcmaDelta2
[
constraint
];
...
...
@@ -186,36 +210,34 @@ kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
atomPos
.
y
+=
constraintForce
*
dir
.
y
;
atomPos
.
z
+=
constraintForce
*
dir
.
z
;
}
atomPositions
[
pos
]
=
atomPos
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
if
(
threadIdx
.
x
==
0
)
requiredIterations
=
iteration
+
1
;
kSyncAllThreads_kernel
(
&
cSim
.
pSyncCounter
[
2
*
gridDim
.
x
],
iteration
);
atomPositions
[
index
]
=
atomPos
;
}
}
// Reset the initial sync counter to be ready for the next call.
if
(
threadIdx
.
x
==
0
)
cSim
.
pSyncCounter
[
blockIdx
.
x
]
=
-
1
;
void
kApplyCCMA
(
gpuContext
gpu
,
float4
*
posq
,
bool
addOldPosition
)
{
kComputeCCMAConstraintDirections
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeCCMAConstraintDirections"
);
for
(
int
i
=
0
;
i
<
150
;
i
++
)
{
kComputeCCMAConstraintForces
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
addOldPosition
);
kMultiplyByCCMAConstraintMatrix
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
();
gpu
->
psCcmaConverged
->
Download
();
if
((
*
gpu
->
psCcmaConverged
)[
0
])
break
;
kUpdateCCMAAtomPositions
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
i
);
}
}
void
kApplyFirstCCMA
(
gpuContext
gpu
)
{
// printf("kApplyFirstCCMA\n");
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
{
kApplyCCMA_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
gpu
->
sim
.
pPosqP
,
true
);
LAUNCHERROR
(
"kApplyCCMA"
);
}
kApplyCCMA
(
gpu
,
gpu
->
sim
.
pPosqP
,
true
);
}
void
kApplySecondCCMA
(
gpuContext
gpu
)
{
// printf("kApplySecondCCMA\n");
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
{
kApplyCCMA_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
gpu
->
sim
.
pPosq
,
false
);
LAUNCHERROR
(
"kApplyCCMA"
);
}
kApplyCCMA
(
gpu
,
gpu
->
sim
.
pPosq
,
false
);
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment