Commit 306d99e8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Enhancements to CUDAStream to reduce the risk of bugs and make debugging easier

parent 968cb132
...@@ -76,40 +76,42 @@ struct CUDAStream : public SoADeviceObject ...@@ -76,40 +76,42 @@ struct CUDAStream : public SoADeviceObject
T** _pDevStream; T** _pDevStream;
T* _pSysData; T* _pSysData;
T* _pDevData; T* _pDevData;
CUDAStream(int length, int subStreams = 1); std::string _name;
CUDAStream(unsigned int length, unsigned int subStreams = 1); CUDAStream(int length, int subStreams = 1, std::string name="");
CUDAStream(unsigned int length, int subStreams = 1); CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
CUDAStream(int length, unsigned int subStreams = 1); CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
virtual ~CUDAStream(); virtual ~CUDAStream();
void Allocate(); void Allocate();
void Deallocate(); void Deallocate();
void Upload(); void Upload();
void Download(); void Download();
void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1); void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
T& operator[](int index);
}; };
float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0); float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
...@@ -129,7 +131,7 @@ void CUDAStream<T>::Allocate() ...@@ -129,7 +131,7 @@ void CUDAStream<T>::Allocate()
_pSysData = new T[_subStreams * _stride]; _pSysData = new T[_subStreams * _stride];
status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T)); status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
RTERROR(status, "cudaMalloc CUDAStream::Allocate failed"); RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
for (unsigned int i = 0; i < _subStreams; i++) for (unsigned int i = 0; i < _subStreams; i++)
{ {
...@@ -149,7 +151,7 @@ void CUDAStream<T>::Deallocate() ...@@ -149,7 +151,7 @@ void CUDAStream<T>::Deallocate()
delete[] _pSysData; delete[] _pSysData;
_pSysData = NULL; _pSysData = NULL;
status = cudaFree(_pDevData); status = cudaFree(_pDevData);
RTERROR(status, "cudaFree CUDAStream::Deallocate failed"); RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
} }
template <typename T> template <typename T>
...@@ -157,7 +159,7 @@ void CUDAStream<T>::Upload() ...@@ -157,7 +159,7 @@ void CUDAStream<T>::Upload()
{ {
cudaError_t status; cudaError_t status;
status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice); status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
RTERROR(status, "cudaMemcpy CUDAStream::Upload failed"); RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
} }
template <typename T> template <typename T>
...@@ -165,7 +167,7 @@ void CUDAStream<T>::Download() ...@@ -165,7 +167,7 @@ void CUDAStream<T>::Download()
{ {
cudaError_t status; cudaError_t status;
status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost); status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
RTERROR(status, "cudaMemcpy CUDAStream::Download failed"); RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
} }
template <typename T> template <typename T>
...@@ -210,6 +212,12 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave) ...@@ -210,6 +212,12 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
delete[] pTemp; delete[] pTemp;
} }
template <typename T>
T& CUDAStream<T>::operator[](int index)
{
return _pSysData[index];
}
static const unsigned int GRID = 32; static const unsigned int GRID = 32;
static const unsigned int GRIDBITS = 5; static const unsigned int GRIDBITS = 5;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256; static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
......
This diff is collapsed.
...@@ -428,8 +428,8 @@ void testBlockInteractions(bool periodic) { ...@@ -428,8 +428,8 @@ void testBlockInteractions(bool periodic) {
data.gpu->psGridBoundingBox->Download(); data.gpu->psGridBoundingBox->Download();
data.gpu->psGridCenter->Download(); data.gpu->psGridCenter->Download();
for (int i = 0; i < numBlocks; i++) { for (int i = 0; i < numBlocks; i++) {
float4 gridSize = data.gpu->psGridBoundingBox->_pSysData[i]; float4 gridSize = (*data.gpu->psGridBoundingBox)[i];
float4 center = data.gpu->psGridCenter->_pSysData[i]; float4 center = (*data.gpu->psGridCenter)[i];
if (periodic) { if (periodic) {
ASSERT(gridSize.x < 0.5*boxSize); ASSERT(gridSize.x < 0.5*boxSize);
ASSERT(gridSize.y < 0.5*boxSize); ASSERT(gridSize.y < 0.5*boxSize);
...@@ -437,7 +437,7 @@ void testBlockInteractions(bool periodic) { ...@@ -437,7 +437,7 @@ void testBlockInteractions(bool periodic) {
} }
float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0; float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
for (int j = 0; j < blockSize; j++) { for (int j = 0; j < blockSize; j++) {
float4 pos = data.gpu->psPosq4->_pSysData[i*blockSize+j]; float4 pos = (*data.gpu->psPosq4)[i*blockSize+j];
float dx = pos.x-center.x; float dx = pos.x-center.x;
float dy = pos.y-center.y; float dy = pos.y-center.y;
float dz = pos.z-center.z; float dz = pos.z-center.z;
...@@ -467,7 +467,7 @@ void testBlockInteractions(bool periodic) { ...@@ -467,7 +467,7 @@ void testBlockInteractions(bool periodic) {
// Verify that interactions were identified correctly. // Verify that interactions were identified correctly.
data.gpu->psInteractionCount->Download(); data.gpu->psInteractionCount->Download();
int numWithInteractions = data.gpu->psInteractionCount->_pSysData[0]; int numWithInteractions = (*data.gpu->psInteractionCount)[0];
vector<bool> hasInteractions(data.gpu->sim.workUnits, false); vector<bool> hasInteractions(data.gpu->sim.workUnits, false);
data.gpu->psInteractingWorkUnit->Download(); data.gpu->psInteractingWorkUnit->Download();
data.gpu->psInteractionFlag->Download(); data.gpu->psInteractionFlag->Download();
...@@ -475,7 +475,7 @@ void testBlockInteractions(bool periodic) { ...@@ -475,7 +475,7 @@ void testBlockInteractions(bool periodic) {
const unsigned int grid = data.gpu->grid; const unsigned int grid = data.gpu->grid;
const unsigned int dim = (atoms+(grid-1))/grid; const unsigned int dim = (atoms+(grid-1))/grid;
for (int i = 0; i < numWithInteractions; i++) { for (int i = 0; i < numWithInteractions; i++) {
unsigned int workUnit = data.gpu->psInteractingWorkUnit->_pSysData[i]; unsigned int workUnit = (*data.gpu->psInteractingWorkUnit)[i];
unsigned int x = (workUnit >> 17); unsigned int x = (workUnit >> 17);
unsigned int y = ((workUnit >> 2) & 0x7fff); unsigned int y = ((workUnit >> 2) & 0x7fff);
int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2); int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
...@@ -483,10 +483,10 @@ void testBlockInteractions(bool periodic) { ...@@ -483,10 +483,10 @@ void testBlockInteractions(bool periodic) {
// Make sure this tile really should have been flagged based on bounding volumes. // Make sure this tile really should have been flagged based on bounding volumes.
float4 gridSize1 = data.gpu->psGridBoundingBox->_pSysData[x]; float4 gridSize1 = (*data.gpu->psGridBoundingBox)[x];
float4 gridSize2 = data.gpu->psGridBoundingBox->_pSysData[y]; float4 gridSize2 = (*data.gpu->psGridBoundingBox)[y];
float4 center1 = data.gpu->psGridCenter->_pSysData[x]; float4 center1 = (*data.gpu->psGridCenter)[x];
float4 center2 = data.gpu->psGridCenter->_pSysData[y]; float4 center2 = (*data.gpu->psGridCenter)[y];
float dx = center1.x-center2.x; float dx = center1.x-center2.x;
float dy = center1.y-center2.y; float dy = center1.y-center2.y;
float dz = center1.z-center2.z; float dz = center1.z-center2.z;
...@@ -502,12 +502,12 @@ void testBlockInteractions(bool periodic) { ...@@ -502,12 +502,12 @@ void testBlockInteractions(bool periodic) {
// Check the interaction flags. // Check the interaction flags.
unsigned int flags = data.gpu->psInteractionFlag->_pSysData[i]; unsigned int flags = (*data.gpu->psInteractionFlag)[i];
for (int atom2 = 0; atom2 < 32; atom2++) { for (int atom2 = 0; atom2 < 32; atom2++) {
if ((flags & 1) == 0) { if ((flags & 1) == 0) {
float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2]; float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
for (int atom1 = 0; atom1 < blockSize; ++atom1) { for (int atom1 = 0; atom1 < blockSize; ++atom1) {
float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1]; float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
float dx = pos2.x-pos1.x; float dx = pos2.x-pos1.x;
float dy = pos2.y-pos1.y; float dy = pos2.y-pos1.y;
float dz = pos2.z-pos1.z; float dz = pos2.z-pos1.z;
...@@ -536,13 +536,13 @@ void testBlockInteractions(bool periodic) { ...@@ -536,13 +536,13 @@ void testBlockInteractions(bool periodic) {
data.gpu->psWorkUnit->Download(); data.gpu->psWorkUnit->Download();
for (int i = 0; i < hasInteractions.size(); i++) for (int i = 0; i < hasInteractions.size(); i++)
if (!hasInteractions[i]) { if (!hasInteractions[i]) {
unsigned int workUnit = data.gpu->psWorkUnit->_pSysData[i]; unsigned int workUnit = (*data.gpu->psWorkUnit)[i];
unsigned int x = (workUnit >> 17); unsigned int x = (workUnit >> 17);
unsigned int y = ((workUnit >> 2) & 0x7fff); unsigned int y = ((workUnit >> 2) & 0x7fff);
for (int atom1 = 0; atom1 < blockSize; ++atom1) { for (int atom1 = 0; atom1 < blockSize; ++atom1) {
float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1]; float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) { for (int atom2 = 0; atom2 < blockSize; ++atom2) {
float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2]; float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
float dx = pos1.x-pos2.x; float dx = pos1.x-pos2.x;
float dy = pos1.y-pos2.y; float dy = pos1.y-pos2.y;
float dz = pos1.z-pos2.z; float dz = pos1.z-pos2.z;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment