Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
1d0193eb
Commit
1d0193eb
authored
Apr 09, 2010
by
Peter Eastman
Browse files
Restructured CCMA code to work on Fermi boards
parent
5301b9cc
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
154 additions
and
142 deletions
+154
-142
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+1
-2
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+5
-13
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+2
-3
platforms/cuda/src/kernels/kCCMA.cu
platforms/cuda/src/kernels/kCCMA.cu
+146
-124
No files found.
platforms/cuda/src/kernels/cudatypes.h
View file @
1d0193eb
...
@@ -460,8 +460,7 @@ struct cudaGmxSimulation {
...
@@ -460,8 +460,7 @@ struct cudaGmxSimulation {
float
*
pCcmaDelta2
;
// Workspace for CCMA
float
*
pCcmaDelta2
;
// Workspace for CCMA
int
*
pCcmaAtomConstraints
;
// The indices of constraints involving each atom
int
*
pCcmaAtomConstraints
;
// The indices of constraints involving each atom
int
*
pCcmaNumAtomConstraints
;
// The number of constraints involving each atom
int
*
pCcmaNumAtomConstraints
;
// The number of constraints involving each atom
short
*
pSyncCounter
;
// Used for global thread synchronization
int
*
pCcmaConverged
;
// Used by CCMA to communicate whether iteration has converged
unsigned
int
*
pRequiredIterations
;
// Used by CCMA to communicate whether iteration has converged
float
*
pCcmaReducedMass
;
// The reduced mass for each CCMA constraint
float
*
pCcmaReducedMass
;
// The reduced mass for each CCMA constraint
unsigned
int
*
pConstraintMatrixColumn
;
// The column of each element in the constraint matrix.
unsigned
int
*
pConstraintMatrixColumn
;
// The column of each element in the constraint matrix.
float
*
pConstraintMatrixValue
;
// The value of each element in the constraint matrix.
float
*
pConstraintMatrixValue
;
// The value of each element in the constraint matrix.
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
1d0193eb
...
@@ -1533,12 +1533,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
...
@@ -1533,12 +1533,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream
<
float
>*
psCcmaDelta2
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaDelta2"
);
CUDAStream
<
float
>*
psCcmaDelta2
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaDelta2"
);
gpu
->
psCcmaDelta2
=
psCcmaDelta2
;
gpu
->
psCcmaDelta2
=
psCcmaDelta2
;
gpu
->
sim
.
pCcmaDelta2
=
psCcmaDelta2
->
_pDevData
;
gpu
->
sim
.
pCcmaDelta2
=
psCcmaDelta2
->
_pDevData
;
CUDAStream
<
short
>*
psSyncCounter
=
new
CUDAStream
<
short
>
(
3
*
gpu
->
sim
.
blocks
,
1
,
"SyncCounter"
);
CUDAStream
<
int
>*
psCcmaConverged
=
new
CUDAStream
<
int
>
(
gpu
->
sim
.
blocks
,
1
,
"CcmaConverged"
);
gpu
->
psSyncCounter
=
psSyncCounter
;
gpu
->
psCcmaConverged
=
psCcmaConverged
;
gpu
->
sim
.
pSyncCounter
=
psSyncCounter
->
_pDevData
;
gpu
->
sim
.
pCcmaConverged
=
psCcmaConverged
->
_pDevData
;
CUDAStream
<
unsigned
int
>*
psRequiredIterations
=
new
CUDAStream
<
unsigned
int
>
(
1
,
1
,
"RequiredIterations"
);
gpu
->
psRequiredIterations
=
psRequiredIterations
;
gpu
->
sim
.
pRequiredIterations
=
psRequiredIterations
->
_pDevData
;
CUDAStream
<
float
>*
psCcmaReducedMass
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaReducedMass"
);
CUDAStream
<
float
>*
psCcmaReducedMass
=
new
CUDAStream
<
float
>
(
numCCMA
,
1
,
"CcmaReducedMass"
);
gpu
->
psCcmaReducedMass
=
psCcmaReducedMass
;
gpu
->
psCcmaReducedMass
=
psCcmaReducedMass
;
gpu
->
sim
.
pCcmaReducedMass
=
psCcmaReducedMass
->
_pDevData
;
gpu
->
sim
.
pCcmaReducedMass
=
psCcmaReducedMass
->
_pDevData
;
...
@@ -1562,8 +1559,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
...
@@ -1562,8 +1559,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
}
}
(
*
psConstraintMatrixColumn
)[
i
+
matrix
[
index
].
size
()
*
numCCMA
]
=
numCCMA
;
(
*
psConstraintMatrixColumn
)[
i
+
matrix
[
index
].
size
()
*
numCCMA
]
=
numCCMA
;
}
}
for
(
unsigned
int
i
=
0
;
i
<
psSyncCounter
->
_length
;
i
++
)
(
*
psSyncCounter
)[
i
]
=
-
1
;
for
(
unsigned
int
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
{
for
(
unsigned
int
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
{
(
*
psCcmaNumAtomConstraints
)[
i
]
=
atomConstraints
[
i
].
size
();
(
*
psCcmaNumAtomConstraints
)[
i
]
=
atomConstraints
[
i
].
size
();
for
(
unsigned
int
j
=
0
;
j
<
atomConstraints
[
i
].
size
();
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
atomConstraints
[
i
].
size
();
j
++
)
{
...
@@ -1576,7 +1571,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
...
@@ -1576,7 +1571,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
psCcmaReducedMass
->
Upload
();
psCcmaReducedMass
->
Upload
();
psCcmaAtomConstraints
->
Upload
();
psCcmaAtomConstraints
->
Upload
();
psCcmaNumAtomConstraints
->
Upload
();
psCcmaNumAtomConstraints
->
Upload
();
psSyncCounter
->
Upload
();
psConstraintMatrixColumn
->
Upload
();
psConstraintMatrixColumn
->
Upload
();
psConstraintMatrixValue
->
Upload
();
psConstraintMatrixValue
->
Upload
();
gpu
->
sim
.
ccma_threads_per_block
=
(
gpu
->
sim
.
ccmaConstraints
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
gpu
->
sim
.
ccma_threads_per_block
=
(
gpu
->
sim
.
ccmaConstraints
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
...
@@ -1994,8 +1988,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
...
@@ -1994,8 +1988,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu
->
psCcmaNumAtomConstraints
=
NULL
;
gpu
->
psCcmaNumAtomConstraints
=
NULL
;
gpu
->
psCcmaDelta1
=
NULL
;
gpu
->
psCcmaDelta1
=
NULL
;
gpu
->
psCcmaDelta2
=
NULL
;
gpu
->
psCcmaDelta2
=
NULL
;
gpu
->
psSyncCounter
=
NULL
;
gpu
->
psCcmaConverged
=
NULL
;
gpu
->
psRequiredIterations
=
NULL
;
gpu
->
psCcmaReducedMass
=
NULL
;
gpu
->
psCcmaReducedMass
=
NULL
;
gpu
->
psConstraintMatrixColumn
=
NULL
;
gpu
->
psConstraintMatrixColumn
=
NULL
;
gpu
->
psConstraintMatrixValue
=
NULL
;
gpu
->
psConstraintMatrixValue
=
NULL
;
...
@@ -2214,8 +2207,7 @@ void gpuShutDown(gpuContext gpu)
...
@@ -2214,8 +2207,7 @@ void gpuShutDown(gpuContext gpu)
delete
gpu
->
psCcmaNumAtomConstraints
;
delete
gpu
->
psCcmaNumAtomConstraints
;
delete
gpu
->
psCcmaDelta1
;
delete
gpu
->
psCcmaDelta1
;
delete
gpu
->
psCcmaDelta2
;
delete
gpu
->
psCcmaDelta2
;
delete
gpu
->
psSyncCounter
;
delete
gpu
->
psCcmaConverged
;
delete
gpu
->
psRequiredIterations
;
delete
gpu
->
psCcmaReducedMass
;
delete
gpu
->
psCcmaReducedMass
;
delete
gpu
->
psConstraintMatrixColumn
;
delete
gpu
->
psConstraintMatrixColumn
;
delete
gpu
->
psConstraintMatrixValue
;
delete
gpu
->
psConstraintMatrixValue
;
...
...
platforms/cuda/src/kernels/gputypes.h
View file @
1d0193eb
...
@@ -171,9 +171,8 @@ struct _gpuContext {
...
@@ -171,9 +171,8 @@ struct _gpuContext {
CUDAStream
<
int
>*
psCcmaNumAtomConstraints
;
// The number of constraints involving each atom
CUDAStream
<
int
>*
psCcmaNumAtomConstraints
;
// The number of constraints involving each atom
CUDAStream
<
float
>*
psCcmaDelta1
;
// Workspace for CCMA
CUDAStream
<
float
>*
psCcmaDelta1
;
// Workspace for CCMA
CUDAStream
<
float
>*
psCcmaDelta2
;
// Workspace for CCMA
CUDAStream
<
float
>*
psCcmaDelta2
;
// Workspace for CCMA
CUDAStream
<
short
>*
psSyncCounter
;
// Used for global thread synchronization
CUDAStream
<
int
>*
psCcmaConverged
;
// Used by CCMA to communicate whether iteration has converged
CUDAStream
<
unsigned
int
>*
psRequiredIterations
;
// Used by SHAKE to communicate whether iteration has converged
CUDAStream
<
float
>*
psCcmaReducedMass
;
// The reduced mass for each CCMA constraint
CUDAStream
<
float
>*
psCcmaReducedMass
;
// The reduced mass for each SHAKE constraint
CUDAStream
<
float
>*
psRigidClusterMatrix
;
// The inverse constraint matrix for each rigid cluster
CUDAStream
<
float
>*
psRigidClusterMatrix
;
// The inverse constraint matrix for each rigid cluster
CUDAStream
<
unsigned
int
>*
psRigidClusterConstraintIndex
;
// The index of each cluster in the stream containing cluster constraints.
CUDAStream
<
unsigned
int
>*
psRigidClusterConstraintIndex
;
// The index of each cluster in the stream containing cluster constraints.
CUDAStream
<
unsigned
int
>*
psRigidClusterMatrixIndex
;
// The index of each cluster in the stream containing cluster matrices.
CUDAStream
<
unsigned
int
>*
psRigidClusterMatrixIndex
;
// The index of each cluster in the stream containing cluster matrices.
...
...
platforms/cuda/src/kernels/kCCMA.cu
View file @
1d0193eb
...
@@ -48,25 +48,7 @@ void GetCCMASim(gpuContext gpu)
...
@@ -48,25 +48,7 @@ void GetCCMASim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
/**
__global__
void
* Synchronize all threads across all blocks.
*/
__device__
void
kSyncAllThreads_kernel
(
short
*
syncCounter
,
short
newCount
)
{
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
syncCounter
[
blockIdx
.
x
]
=
newCount
;
if
(
threadIdx
.
x
<
gridDim
.
x
)
{
volatile
short
*
counter
=
&
syncCounter
[
threadIdx
.
x
];
do
{
}
while
(
*
counter
!=
newCount
);
}
__syncthreads
();
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
#elif (__CUDA_ARCH__ >= 130)
...
@@ -74,148 +56,188 @@ __launch_bounds__(512, 1)
...
@@ -74,148 +56,188 @@ __launch_bounds__(512, 1)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
256
,
1
)
#endif
#endif
k
ApplyCCMA_kernel
(
float4
*
atomPositions
,
bool
addOldPosition
)
k
ComputeCCMAConstraintDirections
(
)
{
{
// Initialize counters used for monitoring convergence and doing global thread synchronization.
__shared__
unsigned
int
requiredIterations
;
if
(
threadIdx
.
x
==
0
)
{
requiredIterations
=
0
;
cSim
.
pSyncCounter
[
gridDim
.
x
+
blockIdx
.
x
]
=
-
1
;
cSim
.
pSyncCounter
[
2
*
gridDim
.
x
+
blockIdx
.
x
]
=
-
1
;
cSim
.
pRequiredIterations
[
0
]
=
0
;
}
// Calculate the direction of each constraint.
// Calculate the direction of each constraint.
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
while
(
pos
<
cSim
.
ccmaConstraints
)
{
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
pos
];
int2
atoms
=
cSim
.
pCcmaAtoms
[
index
];
float4
dir
=
cSim
.
pCcmaDistance
[
pos
];
float4
dir
=
cSim
.
pCcmaDistance
[
index
];
float4
oldPos1
=
cSim
.
pOldPosq
[
atoms
.
x
];
float4
oldPos1
=
cSim
.
pOldPosq
[
atoms
.
x
];
float4
oldPos2
=
cSim
.
pOldPosq
[
atoms
.
y
];
float4
oldPos2
=
cSim
.
pOldPosq
[
atoms
.
y
];
dir
.
x
=
oldPos1
.
x
-
oldPos2
.
x
;
dir
.
x
=
oldPos1
.
x
-
oldPos2
.
x
;
dir
.
y
=
oldPos1
.
y
-
oldPos2
.
y
;
dir
.
y
=
oldPos1
.
y
-
oldPos2
.
y
;
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
cSim
.
pCcmaDistance
[
pos
]
=
dir
;
cSim
.
pCcmaDistance
[
index
]
=
dir
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
__syncthreads
();
//
Iteratively update the atom positions
.
//
Mark that no blocks have converged yet
.
unsigned
int
maxIterations
=
150
;
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
cSim
.
pCcmaConverged
[
index
]
=
false
;
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kComputeCCMAConstraintForces
(
float4
*
atomPositions
,
bool
addOldPosition
)
{
if
(
cSim
.
pCcmaConverged
[
blockIdx
.
x
])
return
;
// The constraint iteration has already converged.
extern
__shared__
int
convergedBuffer
[];
float
lowerTol
=
1.0
f
-
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
float
lowerTol
=
1.0
f
-
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
float
upperTol
=
1.0
f
+
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
float
upperTol
=
1.0
f
+
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
for
(
unsigned
int
iteration
=
0
;
iteration
<
maxIterations
;
iteration
++
)
int
threadConverged
=
1
;
{
// Calculate the constraint force for each constraint.
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
// Calculate the constraint force for each constraint.
while
(
pos
<
cSim
.
ccmaConstraints
)
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
index
];
float4
delta1
=
atomPositions
[
atoms
.
x
];
float4
delta2
=
atomPositions
[
atoms
.
y
];
float4
dir
=
cSim
.
pCcmaDistance
[
index
];
float3
rp_ij
=
make_float3
(
delta1
.
x
-
delta2
.
x
,
delta1
.
y
-
delta2
.
y
,
delta1
.
z
-
delta2
.
z
);
if
(
addOldPosition
)
{
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
pos
];
rp_ij
.
x
+=
dir
.
x
;
float4
delta1
=
atomPositions
[
atoms
.
x
];
rp_ij
.
y
+=
dir
.
y
;
float4
delta2
=
atomPositions
[
atoms
.
y
];
rp_ij
.
z
+=
dir
.
z
;
float4
dir
=
cSim
.
pCcmaDistance
[
pos
];
float3
rp_ij
=
make_float3
(
delta1
.
x
-
delta2
.
x
,
delta1
.
y
-
delta2
.
y
,
delta1
.
z
-
delta2
.
z
);
if
(
addOldPosition
)
{
rp_ij
.
x
+=
dir
.
x
;
rp_ij
.
y
+=
dir
.
y
;
rp_ij
.
z
+=
dir
.
z
;
}
float
rp2
=
rp_ij
.
x
*
rp_ij
.
x
+
rp_ij
.
y
*
rp_ij
.
y
+
rp_ij
.
z
*
rp_ij
.
z
;
float
dist2
=
dir
.
w
*
dir
.
w
;
float
diff
=
dist2
-
rp2
;
float
rrpr
=
rp_ij
.
x
*
dir
.
x
+
rp_ij
.
y
*
dir
.
y
+
rp_ij
.
z
*
dir
.
z
;
float
d_ij2
=
dir
.
x
*
dir
.
x
+
dir
.
y
*
dir
.
y
+
dir
.
z
*
dir
.
z
;
float
reducedMass
=
cSim
.
pCcmaReducedMass
[
pos
];
cSim
.
pCcmaDelta1
[
pos
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
*
diff
/
rrpr
:
0.0
f
);
if
(
requiredIterations
==
iteration
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
requiredIterations
=
iteration
+
1
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
float
rp2
=
rp_ij
.
x
*
rp_ij
.
x
+
rp_ij
.
y
*
rp_ij
.
y
+
rp_ij
.
z
*
rp_ij
.
z
;
float
dist2
=
dir
.
w
*
dir
.
w
;
float
diff
=
dist2
-
rp2
;
float
rrpr
=
rp_ij
.
x
*
dir
.
x
+
rp_ij
.
y
*
dir
.
y
+
rp_ij
.
z
*
dir
.
z
;
float
d_ij2
=
dir
.
x
*
dir
.
x
+
dir
.
y
*
dir
.
y
+
dir
.
z
*
dir
.
z
;
float
reducedMass
=
cSim
.
pCcmaReducedMass
[
index
];
cSim
.
pCcmaDelta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
*
diff
/
rrpr
:
0.0
f
);
// See whether it has converged.
threadConverged
&=
(
rp2
>=
lowerTol
*
dist2
&&
rp2
<=
upperTol
*
dist2
);
}
// Perform a parallel reduction to see if all constraints handled by this block have converged.
convergedBuffer
[
threadIdx
.
x
]
=
threadConverged
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
%
(
2
*
step
)
==
0
)
convergedBuffer
[
threadIdx
.
x
]
&=
convergedBuffer
[
threadIdx
.
x
+
step
];
__syncthreads
();
__syncthreads
();
if
(
threadIdx
.
x
==
0
&&
requiredIterations
>
iteration
)
}
cSim
.
pRequiredIterations
[
0
]
=
requiredIterations
;
if
(
threadIdx
.
x
==
0
)
kSyncAllThreads_kernel
(
cSim
.
pSyncCounter
,
iteration
);
cSim
.
pCcmaConverged
[
blockIdx
.
x
]
=
convergedBuffer
[
0
];
if
(
iteration
==
cSim
.
pRequiredIterations
[
0
])
}
break
;
// All constraints have converged.
// Multiply by the inverse constraint matrix.
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kMultiplyByCCMAConstraintMatrix
()
{
extern
__shared__
int
convergedBuffer
[];
// First see whether all work groups have converged.
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
convergedBuffer
[
threadIdx
.
x
]
=
true
;
while
(
pos
<
cSim
.
ccmaConstraints
)
for
(
int
index
=
threadIdx
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
)
{
convergedBuffer
[
threadIdx
.
x
]
&=
cSim
.
pCcmaConverged
[
index
];
float
sum
=
0.0
f
;
__syncthreads
();
for
(
unsigned
int
i
=
0
;
;
i
++
)
for
(
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
{
if
(
threadIdx
.
x
%
(
2
*
step
)
==
0
)
unsigned
int
index
=
pos
+
i
*
cSim
.
ccmaConstraints
;
convergedBuffer
[
threadIdx
.
x
]
&=
convergedBuffer
[
threadIdx
.
x
+
step
];
unsigned
int
column
=
cSim
.
pConstraintMatrixColumn
[
index
];
__syncthreads
();
if
(
column
>=
cSim
.
ccmaConstraints
)
}
break
;
if
(
threadIdx
.
x
==
0
)
sum
+=
cSim
.
pCcmaDelta1
[
column
]
*
cSim
.
pConstraintMatrixValue
[
index
];
cSim
.
pCcmaConverged
[
blockIdx
.
x
]
=
convergedBuffer
[
0
];
}
if
(
cSim
.
pCcmaConverged
[
0
])
cSim
.
pCcmaDelta2
[
pos
]
=
sum
;
return
;
// The constraint iteration has already converged.
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
kSyncAllThreads_kernel
(
&
cSim
.
pSyncCounter
[
gridDim
.
x
],
iteration
);
//
Update the position of each atom
.
//
Multiply by the inverse constraint matrix
.
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
float
damping
=
(
iteration
<
2
?
0.5
f
:
1.0
f
);
{
while
(
pos
<
cSim
.
atoms
)
float
sum
=
0.0
f
;
for
(
unsigned
int
i
=
0
;
;
i
++
)
{
{
float4
atomPos
=
atomPositions
[
pos
];
unsigned
int
element
=
index
+
i
*
cSim
.
ccmaConstraints
;
float
invMass
=
cSim
.
pVelm4
[
pos
].
w
;
unsigned
int
column
=
cSim
.
pConstraintMatrixColumn
[
element
];
int
num
=
cSim
.
pCcmaNumAtomConstraints
[
pos
];
if
(
column
>=
cSim
.
ccmaConstraints
)
for
(
int
i
=
0
;
i
<
num
;
i
++
)
break
;
{
sum
+=
cSim
.
pCcmaDelta1
[
column
]
*
cSim
.
pConstraintMatrixValue
[
element
];
int
index
=
pos
+
i
*
cSim
.
atoms
;
int
constraint
=
cSim
.
pCcmaAtomConstraints
[
index
];
bool
forward
=
(
constraint
>
0
);
constraint
=
(
forward
?
constraint
-
1
:
-
constraint
-
1
);
float
constraintForce
=
damping
*
invMass
*
cSim
.
pCcmaDelta2
[
constraint
];
constraintForce
=
(
forward
?
constraintForce
:
-
constraintForce
);
float4
dir
=
cSim
.
pCcmaDistance
[
constraint
];
atomPos
.
x
+=
constraintForce
*
dir
.
x
;
atomPos
.
y
+=
constraintForce
*
dir
.
y
;
atomPos
.
z
+=
constraintForce
*
dir
.
z
;
}
atomPositions
[
pos
]
=
atomPos
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
if
(
threadIdx
.
x
==
0
)
cSim
.
pCcmaDelta2
[
index
]
=
sum
;
requiredIterations
=
iteration
+
1
;
kSyncAllThreads_kernel
(
&
cSim
.
pSyncCounter
[
2
*
gridDim
.
x
],
iteration
);
}
}
}
// Reset the initial sync counter to be ready for the next call.
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kUpdateCCMAAtomPositions
(
float4
*
atomPositions
,
int
iteration
)
{
if
(
cSim
.
pCcmaConverged
[
blockIdx
.
x
])
return
;
// The constraint iteration has already converged.
float
damping
=
(
iteration
<
2
?
0.5
f
:
1.0
f
);
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
atoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
atomPos
=
atomPositions
[
index
];
float
invMass
=
cSim
.
pVelm4
[
index
].
w
;
int
num
=
cSim
.
pCcmaNumAtomConstraints
[
index
];
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
int
constraint
=
cSim
.
pCcmaAtomConstraints
[
index
+
i
*
cSim
.
atoms
];
bool
forward
=
(
constraint
>
0
);
constraint
=
(
forward
?
constraint
-
1
:
-
constraint
-
1
);
float
constraintForce
=
damping
*
invMass
*
cSim
.
pCcmaDelta2
[
constraint
];
constraintForce
=
(
forward
?
constraintForce
:
-
constraintForce
);
float4
dir
=
cSim
.
pCcmaDistance
[
constraint
];
atomPos
.
x
+=
constraintForce
*
dir
.
x
;
atomPos
.
y
+=
constraintForce
*
dir
.
y
;
atomPos
.
z
+=
constraintForce
*
dir
.
z
;
}
atomPositions
[
index
]
=
atomPos
;
}
}
if
(
threadIdx
.
x
==
0
)
void
kApplyCCMA
(
gpuContext
gpu
,
float4
*
posq
,
bool
addOldPosition
)
cSim
.
pSyncCounter
[
blockIdx
.
x
]
=
-
1
;
{
kComputeCCMAConstraintDirections
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeCCMAConstraintDirections"
);
for
(
int
i
=
0
;
i
<
150
;
i
++
)
{
kComputeCCMAConstraintForces
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
addOldPosition
);
kMultiplyByCCMAConstraintMatrix
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
();
gpu
->
psCcmaConverged
->
Download
();
if
((
*
gpu
->
psCcmaConverged
)[
0
])
break
;
kUpdateCCMAAtomPositions
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
i
);
}
}
}
void
kApplyFirstCCMA
(
gpuContext
gpu
)
void
kApplyFirstCCMA
(
gpuContext
gpu
)
{
{
// printf("kApplyFirstCCMA\n");
// printf("kApplyFirstCCMA\n");
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
{
kApplyCCMA
(
gpu
,
gpu
->
sim
.
pPosqP
,
true
);
kApplyCCMA_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
gpu
->
sim
.
pPosqP
,
true
);
LAUNCHERROR
(
"kApplyCCMA"
);
}
}
}
void
kApplySecondCCMA
(
gpuContext
gpu
)
void
kApplySecondCCMA
(
gpuContext
gpu
)
{
{
// printf("kApplySecondCCMA\n");
// printf("kApplySecondCCMA\n");
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
{
kApplyCCMA
(
gpu
,
gpu
->
sim
.
pPosq
,
false
);
kApplyCCMA_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
gpu
->
sim
.
pPosq
,
false
);
LAUNCHERROR
(
"kApplyCCMA"
);
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment