Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
390e0a6b
Commit
390e0a6b
authored
Oct 21, 2014
by
root
Committed by
peastman
Oct 21, 2014
Browse files
Optimizations to multi-GPU calculations
parent
ba66e90e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
34 additions
and
25 deletions
+34
-25
platforms/cuda/include/CudaParallelKernels.h
platforms/cuda/include/CudaParallelKernels.h
+1
-0
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+10
-0
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+16
-9
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+7
-16
No files found.
platforms/cuda/include/CudaParallelKernels.h
View file @
390e0a6b
...
@@ -85,6 +85,7 @@ private:
...
@@ -85,6 +85,7 @@ private:
void
*
pinnedPositionBuffer
;
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
long
long
*
pinnedForceBuffer
;
CUfunction
sumKernel
;
CUfunction
sumKernel
;
CUevent
event
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaContext.cpp
View file @
390e0a6b
...
@@ -154,6 +154,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -154,6 +154,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
contextIsValid
=
true
;
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
if
(
contextIndex
>
0
)
{
int
canAccess
;
cuDeviceCanAccessPeer
(
&
canAccess
,
getDevice
(),
platformData
.
contexts
[
0
]
->
getDevice
());
if
(
canAccess
)
{
platformData
.
contexts
[
0
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
getContext
(),
0
));
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
platformData
.
contexts
[
0
]
->
getContext
(),
0
));
}
}
numAtoms
=
system
.
getNumParticles
();
numAtoms
=
system
.
getNumParticles
();
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
390e0a6b
...
@@ -63,22 +63,24 @@ if (result != CUDA_SUCCESS) { \
...
@@ -63,22 +63,24 @@ if (result != CUDA_SUCCESS) { \
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
public:
public:
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
)
{
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
)
,
event
(
event
)
{
}
}
void
execute
()
{
void
execute
()
{
// Copy coordinates over to this device and execute the kernel.
// Copy coordinates over to this device and execute the kernel.
cu
.
setAsCurrent
();
cu
.
setAsCurrent
();
if
(
cu
.
getContextIndex
()
>
0
)
{
if
(
cu
.
getContextIndex
()
>
0
)
{
if
(
cu
.
getPlatformData
().
peerAccessSupported
&&
cu
.
getPlatformData
().
contexts
.
size
()
<
3
)
{
if
(
cu
.
getPlatformData
().
peerAccessSupported
&&
false
)
{
// Why is the peer-to-peer copy slower???
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
CHECK_RESULT
(
cuMemcpy
Peer
Async
(
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getContext
(),
context0
.
getPosq
().
getDevicePointer
(),
context0
.
getContext
(),
numBytes
,
0
),
"Error copying positions"
);
CHECK_RESULT
(
cuMemcpyAsync
(
cu
.
getPosq
().
getDevicePointer
(),
context0
.
getPosq
().
getDevicePointer
(),
numBytes
,
0
),
"Error copying positions"
);
}
}
else
else
{
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
cu
.
getPosq
().
upload
(
pinnedMemory
,
false
);
cu
.
getPosq
().
upload
(
pinnedMemory
,
false
);
}
}
}
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
}
}
private:
private:
...
@@ -88,6 +90,7 @@ private:
...
@@ -88,6 +90,7 @@ private:
bool
includeForce
,
includeEnergy
;
bool
includeForce
,
includeEnergy
;
int
groups
;
int
groups
;
void
*
pinnedMemory
;
void
*
pinnedMemory
;
CUevent
event
;
};
};
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
...
@@ -108,7 +111,7 @@ public:
...
@@ -108,7 +111,7 @@ public:
int
numBytes
=
numAtoms
*
3
*
sizeof
(
long
long
);
int
numBytes
=
numAtoms
*
3
*
sizeof
(
long
long
);
int
offset
=
(
cu
.
getContextIndex
()
-
1
)
*
numBytes
;
int
offset
=
(
cu
.
getContextIndex
()
-
1
)
*
numBytes
;
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
CHECK_RESULT
(
cuMemcpy
Peer
(
contextForces
.
getDevicePointer
()
+
offset
,
context0
.
getContext
(),
cu
.
getForce
().
getDevicePointer
(),
cu
.
getContext
(),
numBytes
),
"Error copying forces"
);
CHECK_RESULT
(
cuMemcpy
(
contextForces
.
getDevicePointer
()
+
offset
,
cu
.
getForce
().
getDevicePointer
(),
numBytes
),
"Error copying forces"
);
}
}
else
else
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
...
@@ -146,6 +149,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
...
@@ -146,6 +149,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost
(
pinnedPositionBuffer
);
cuMemFreeHost
(
pinnedPositionBuffer
);
if
(
pinnedForceBuffer
!=
NULL
)
if
(
pinnedForceBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedForceBuffer
);
cuMemFreeHost
(
pinnedForceBuffer
);
cuEventDestroy
(
event
);
}
}
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
...
@@ -157,6 +161,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...
@@ -157,6 +161,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
getKernel
(
i
).
initialize
(
system
);
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
}
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
@@ -170,13 +175,15 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
...
@@ -170,13 +175,15 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel.
// Copy coordinates over to each device and execute the kernel.
if
(
!
(
cu
.
getPlatformData
().
peerAccessSupported
&&
cu
.
getPlatformData
().
contexts
.
size
()
<
3
))
if
(
!
(
cu
.
getPlatformData
().
peerAccessSupported
&&
false
))
{
// Why is this faster than a peer-to-peer copy???
cu
.
getPosq
().
download
(
pinnedPositionBuffer
);
cu
.
getPosq
().
download
(
pinnedPositionBuffer
,
false
);
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
}
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
data
.
contextEnergy
[
i
]
=
0.0
;
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
));
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
));
}
}
}
}
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
390e0a6b
...
@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
...
@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
// Determine whether peer-to-peer copying is supported, and enable it if so.
// Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported
=
false
;
// Disable until I figure out why it usually makes things slower
peerAccessSupported
=
true
;
// peerAccessSupported = true;
// for (int i = 1; i < contexts.size(); i++) {
// int canAccess;
// cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
// if (!canAccess) {
// peerAccessSupported = false;
// break;
// }
// }
if
(
peerAccessSupported
)
{
for
(
int
i
=
1
;
i
<
contexts
.
size
();
i
++
)
{
for
(
int
i
=
1
;
i
<
contexts
.
size
();
i
++
)
{
contexts
[
0
]
->
setAsCurrent
();
int
canAccess
;
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
contexts
[
i
]
->
getContext
(),
0
),
"Error enabling peer access"
);
cuDeviceCanAccessPeer
(
&
canAccess
,
contexts
[
i
]
->
getDevice
(),
contexts
[
0
]
->
getDevice
());
contexts
[
i
]
->
setAsCurrent
();
if
(
!
canAccess
)
{
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
contexts
[
0
]
->
getContext
(),
0
),
"Error enabling peer access"
);
peerAccessSupported
=
false
;
break
;
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment