"vscode:/vscode.git/clone" did not exist on "14b2cfc528cfd123aff42c1383e6e6ae1ad7a9eb"
Commit 306d99e8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Enhancements to CUDAStream to reduce the risk of bugs and make debugging easier

parent 968cb132
...@@ -76,40 +76,42 @@ struct CUDAStream : public SoADeviceObject ...@@ -76,40 +76,42 @@ struct CUDAStream : public SoADeviceObject
T** _pDevStream; T** _pDevStream;
T* _pSysData; T* _pSysData;
T* _pDevData; T* _pDevData;
CUDAStream(int length, int subStreams = 1); std::string _name;
CUDAStream(unsigned int length, unsigned int subStreams = 1); CUDAStream(int length, int subStreams = 1, std::string name="");
CUDAStream(unsigned int length, int subStreams = 1); CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
CUDAStream(int length, unsigned int subStreams = 1); CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
virtual ~CUDAStream(); virtual ~CUDAStream();
void Allocate(); void Allocate();
void Deallocate(); void Deallocate();
void Upload(); void Upload();
void Download(); void Download();
void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1); void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
T& operator[](int index);
}; };
float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0); float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
template <typename T> template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0) CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{ {
Allocate(); Allocate();
} }
...@@ -129,7 +131,7 @@ void CUDAStream<T>::Allocate() ...@@ -129,7 +131,7 @@ void CUDAStream<T>::Allocate()
_pSysData = new T[_subStreams * _stride]; _pSysData = new T[_subStreams * _stride];
status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T)); status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
RTERROR(status, "cudaMalloc CUDAStream::Allocate failed"); RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
for (unsigned int i = 0; i < _subStreams; i++) for (unsigned int i = 0; i < _subStreams; i++)
{ {
...@@ -149,7 +151,7 @@ void CUDAStream<T>::Deallocate() ...@@ -149,7 +151,7 @@ void CUDAStream<T>::Deallocate()
delete[] _pSysData; delete[] _pSysData;
_pSysData = NULL; _pSysData = NULL;
status = cudaFree(_pDevData); status = cudaFree(_pDevData);
RTERROR(status, "cudaFree CUDAStream::Deallocate failed"); RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
} }
template <typename T> template <typename T>
...@@ -157,7 +159,7 @@ void CUDAStream<T>::Upload() ...@@ -157,7 +159,7 @@ void CUDAStream<T>::Upload()
{ {
cudaError_t status; cudaError_t status;
status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice); status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
RTERROR(status, "cudaMemcpy CUDAStream::Upload failed"); RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
} }
template <typename T> template <typename T>
...@@ -165,7 +167,7 @@ void CUDAStream<T>::Download() ...@@ -165,7 +167,7 @@ void CUDAStream<T>::Download()
{ {
cudaError_t status; cudaError_t status;
status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost); status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
RTERROR(status, "cudaMemcpy CUDAStream::Download failed"); RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
} }
template <typename T> template <typename T>
...@@ -210,6 +212,12 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave) ...@@ -210,6 +212,12 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
delete[] pTemp; delete[] pTemp;
} }
template <typename T>
T& CUDAStream<T>::operator[](int index)
{
return _pSysData[index];
}
static const unsigned int GRID = 32; static const unsigned int GRID = 32;
static const unsigned int GRIDBITS = 5; static const unsigned int GRIDBITS = 5;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256; static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
......
This diff is collapsed.
...@@ -428,8 +428,8 @@ void testBlockInteractions(bool periodic) { ...@@ -428,8 +428,8 @@ void testBlockInteractions(bool periodic) {
data.gpu->psGridBoundingBox->Download(); data.gpu->psGridBoundingBox->Download();
data.gpu->psGridCenter->Download(); data.gpu->psGridCenter->Download();
for (int i = 0; i < numBlocks; i++) { for (int i = 0; i < numBlocks; i++) {
float4 gridSize = data.gpu->psGridBoundingBox->_pSysData[i]; float4 gridSize = (*data.gpu->psGridBoundingBox)[i];
float4 center = data.gpu->psGridCenter->_pSysData[i]; float4 center = (*data.gpu->psGridCenter)[i];
if (periodic) { if (periodic) {
ASSERT(gridSize.x < 0.5*boxSize); ASSERT(gridSize.x < 0.5*boxSize);
ASSERT(gridSize.y < 0.5*boxSize); ASSERT(gridSize.y < 0.5*boxSize);
...@@ -437,7 +437,7 @@ void testBlockInteractions(bool periodic) { ...@@ -437,7 +437,7 @@ void testBlockInteractions(bool periodic) {
} }
float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0; float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
for (int j = 0; j < blockSize; j++) { for (int j = 0; j < blockSize; j++) {
float4 pos = data.gpu->psPosq4->_pSysData[i*blockSize+j]; float4 pos = (*data.gpu->psPosq4)[i*blockSize+j];
float dx = pos.x-center.x; float dx = pos.x-center.x;
float dy = pos.y-center.y; float dy = pos.y-center.y;
float dz = pos.z-center.z; float dz = pos.z-center.z;
...@@ -467,7 +467,7 @@ void testBlockInteractions(bool periodic) { ...@@ -467,7 +467,7 @@ void testBlockInteractions(bool periodic) {
// Verify that interactions were identified correctly. // Verify that interactions were identified correctly.
data.gpu->psInteractionCount->Download(); data.gpu->psInteractionCount->Download();
int numWithInteractions = data.gpu->psInteractionCount->_pSysData[0]; int numWithInteractions = (*data.gpu->psInteractionCount)[0];
vector<bool> hasInteractions(data.gpu->sim.workUnits, false); vector<bool> hasInteractions(data.gpu->sim.workUnits, false);
data.gpu->psInteractingWorkUnit->Download(); data.gpu->psInteractingWorkUnit->Download();
data.gpu->psInteractionFlag->Download(); data.gpu->psInteractionFlag->Download();
...@@ -475,7 +475,7 @@ void testBlockInteractions(bool periodic) { ...@@ -475,7 +475,7 @@ void testBlockInteractions(bool periodic) {
const unsigned int grid = data.gpu->grid; const unsigned int grid = data.gpu->grid;
const unsigned int dim = (atoms+(grid-1))/grid; const unsigned int dim = (atoms+(grid-1))/grid;
for (int i = 0; i < numWithInteractions; i++) { for (int i = 0; i < numWithInteractions; i++) {
unsigned int workUnit = data.gpu->psInteractingWorkUnit->_pSysData[i]; unsigned int workUnit = (*data.gpu->psInteractingWorkUnit)[i];
unsigned int x = (workUnit >> 17); unsigned int x = (workUnit >> 17);
unsigned int y = ((workUnit >> 2) & 0x7fff); unsigned int y = ((workUnit >> 2) & 0x7fff);
int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2); int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
...@@ -483,10 +483,10 @@ void testBlockInteractions(bool periodic) { ...@@ -483,10 +483,10 @@ void testBlockInteractions(bool periodic) {
// Make sure this tile really should have been flagged based on bounding volumes. // Make sure this tile really should have been flagged based on bounding volumes.
float4 gridSize1 = data.gpu->psGridBoundingBox->_pSysData[x]; float4 gridSize1 = (*data.gpu->psGridBoundingBox)[x];
float4 gridSize2 = data.gpu->psGridBoundingBox->_pSysData[y]; float4 gridSize2 = (*data.gpu->psGridBoundingBox)[y];
float4 center1 = data.gpu->psGridCenter->_pSysData[x]; float4 center1 = (*data.gpu->psGridCenter)[x];
float4 center2 = data.gpu->psGridCenter->_pSysData[y]; float4 center2 = (*data.gpu->psGridCenter)[y];
float dx = center1.x-center2.x; float dx = center1.x-center2.x;
float dy = center1.y-center2.y; float dy = center1.y-center2.y;
float dz = center1.z-center2.z; float dz = center1.z-center2.z;
...@@ -502,12 +502,12 @@ void testBlockInteractions(bool periodic) { ...@@ -502,12 +502,12 @@ void testBlockInteractions(bool periodic) {
// Check the interaction flags. // Check the interaction flags.
unsigned int flags = data.gpu->psInteractionFlag->_pSysData[i]; unsigned int flags = (*data.gpu->psInteractionFlag)[i];
for (int atom2 = 0; atom2 < 32; atom2++) { for (int atom2 = 0; atom2 < 32; atom2++) {
if ((flags & 1) == 0) { if ((flags & 1) == 0) {
float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2]; float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
for (int atom1 = 0; atom1 < blockSize; ++atom1) { for (int atom1 = 0; atom1 < blockSize; ++atom1) {
float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1]; float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
float dx = pos2.x-pos1.x; float dx = pos2.x-pos1.x;
float dy = pos2.y-pos1.y; float dy = pos2.y-pos1.y;
float dz = pos2.z-pos1.z; float dz = pos2.z-pos1.z;
...@@ -536,13 +536,13 @@ void testBlockInteractions(bool periodic) { ...@@ -536,13 +536,13 @@ void testBlockInteractions(bool periodic) {
data.gpu->psWorkUnit->Download(); data.gpu->psWorkUnit->Download();
for (int i = 0; i < hasInteractions.size(); i++) for (int i = 0; i < hasInteractions.size(); i++)
if (!hasInteractions[i]) { if (!hasInteractions[i]) {
unsigned int workUnit = data.gpu->psWorkUnit->_pSysData[i]; unsigned int workUnit = (*data.gpu->psWorkUnit)[i];
unsigned int x = (workUnit >> 17); unsigned int x = (workUnit >> 17);
unsigned int y = ((workUnit >> 2) & 0x7fff); unsigned int y = ((workUnit >> 2) & 0x7fff);
for (int atom1 = 0; atom1 < blockSize; ++atom1) { for (int atom1 = 0; atom1 < blockSize; ++atom1) {
float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1]; float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) { for (int atom2 = 0; atom2 < blockSize; ++atom2) {
float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2]; float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
float dx = pos1.x-pos2.x; float dx = pos1.x-pos2.x;
float dy = pos1.y-pos2.y; float dy = pos1.y-pos2.y;
float dz = pos1.z-pos2.z; float dz = pos1.z-pos2.z;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment