Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
390e0a6b
Commit
390e0a6b
authored
Oct 21, 2014
by
root
Committed by
peastman
Oct 21, 2014
Browse files
Optimizations to multi-GPU calculations
parent
ba66e90e
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
34 additions
and
25 deletions
+34
-25
platforms/cuda/include/CudaParallelKernels.h
platforms/cuda/include/CudaParallelKernels.h
+1
-0
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+10
-0
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+16
-9
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+7
-16
No files found.
platforms/cuda/include/CudaParallelKernels.h
View file @
390e0a6b
...
...
@@ -85,6 +85,7 @@ private:
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
CUfunction
sumKernel
;
CUevent
event
;
};
/**
...
...
platforms/cuda/src/CudaContext.cpp
View file @
390e0a6b
...
...
@@ -154,6 +154,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
if
(
contextIndex
>
0
)
{
int
canAccess
;
cuDeviceCanAccessPeer
(
&
canAccess
,
getDevice
(),
platformData
.
contexts
[
0
]
->
getDevice
());
if
(
canAccess
)
{
platformData
.
contexts
[
0
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
getContext
(),
0
));
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
platformData
.
contexts
[
0
]
->
getContext
(),
0
));
}
}
numAtoms
=
system
.
getNumParticles
();
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
390e0a6b
...
...
@@ -63,21 +63,23 @@ if (result != CUDA_SUCCESS) { \
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
public:
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
)
{
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
)
,
event
(
event
)
{
}
void
execute
()
{
// Copy coordinates over to this device and execute the kernel.
cu
.
setAsCurrent
();
if
(
cu
.
getContextIndex
()
>
0
)
{
if
(
cu
.
getPlatformData
().
peerAccessSupported
&&
cu
.
getPlatformData
().
contexts
.
size
()
<
3
)
{
if
(
cu
.
getPlatformData
().
peerAccessSupported
&&
false
)
{
// Why is the peer-to-peer copy slower???
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
CHECK_RESULT
(
cuMemcpy
Peer
Async
(
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getContext
(),
context0
.
getPosq
().
getDevicePointer
(),
context0
.
getContext
(),
numBytes
,
0
),
"Error copying positions"
);
CHECK_RESULT
(
cuMemcpyAsync
(
cu
.
getPosq
().
getDevicePointer
(),
context0
.
getPosq
().
getDevicePointer
(),
numBytes
,
0
),
"Error copying positions"
);
}
else
else
{
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
cu
.
getPosq
().
upload
(
pinnedMemory
,
false
);
}
}
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
}
...
...
@@ -88,6 +90,7 @@ private:
bool
includeForce
,
includeEnergy
;
int
groups
;
void
*
pinnedMemory
;
CUevent
event
;
};
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
...
...
@@ -108,7 +111,7 @@ public:
int
numBytes
=
numAtoms
*
3
*
sizeof
(
long
long
);
int
offset
=
(
cu
.
getContextIndex
()
-
1
)
*
numBytes
;
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
CHECK_RESULT
(
cuMemcpy
Peer
(
contextForces
.
getDevicePointer
()
+
offset
,
context0
.
getContext
(),
cu
.
getForce
().
getDevicePointer
(),
cu
.
getContext
(),
numBytes
),
"Error copying forces"
);
CHECK_RESULT
(
cuMemcpy
(
contextForces
.
getDevicePointer
()
+
offset
,
cu
.
getForce
().
getDevicePointer
(),
numBytes
),
"Error copying forces"
);
}
else
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
...
...
@@ -146,6 +149,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost
(
pinnedPositionBuffer
);
if
(
pinnedForceBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedForceBuffer
);
cuEventDestroy
(
event
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
...
...
@@ -157,6 +161,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -170,13 +175,15 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel.
if
(
!
(
cu
.
getPlatformData
().
peerAccessSupported
&&
cu
.
getPlatformData
().
contexts
.
size
()
<
3
))
cu
.
getPosq
().
download
(
pinnedPositionBuffer
);
if
(
!
(
cu
.
getPlatformData
().
peerAccessSupported
&&
false
))
{
// Why is this faster than a peer-to-peer copy???
cu
.
getPosq
().
download
(
pinnedPositionBuffer
,
false
);
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
}
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
));
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
));
}
}
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
390e0a6b
...
...
@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
// Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported
=
false
;
// Disable until I figure out why it usually makes things slower
// peerAccessSupported = true;
// for (int i = 1; i < contexts.size(); i++) {
// int canAccess;
// cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
// if (!canAccess) {
// peerAccessSupported = false;
// break;
// }
// }
if
(
peerAccessSupported
)
{
for
(
int
i
=
1
;
i
<
contexts
.
size
();
i
++
)
{
contexts
[
0
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
contexts
[
i
]
->
getContext
(),
0
),
"Error enabling peer access"
);
contexts
[
i
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
contexts
[
0
]
->
getContext
(),
0
),
"Error enabling peer access"
);
peerAccessSupported
=
true
;
for
(
int
i
=
1
;
i
<
contexts
.
size
();
i
++
)
{
int
canAccess
;
cuDeviceCanAccessPeer
(
&
canAccess
,
contexts
[
i
]
->
getDevice
(),
contexts
[
0
]
->
getDevice
());
if
(
!
canAccess
)
{
peerAccessSupported
=
false
;
break
;
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment