Commit 5b017677 authored by Peter Eastman's avatar Peter Eastman
Browse files

Converted lots of ints to unsigned ints (might help performance a little by avoiding conversions)

parent e85347d1
......@@ -210,8 +210,8 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
delete[] pTemp;
}
static const int GRID = 32;
static const int GRIDBITS = 5;
static const unsigned int GRID = 32;
static const unsigned int GRIDBITS = 5;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
......
......@@ -35,18 +35,18 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, int numWorkUnits)
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
#endif
int lasty = -1;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
......@@ -69,7 +69,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
......@@ -130,7 +130,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
else // bExclusion
{
unsigned int xi = x>>GRIDBITS;
int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
......@@ -177,7 +177,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset];
of.x += af.x;
of.y += af.y;
......@@ -188,7 +188,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
#endif
}
......@@ -197,7 +197,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Read fixed atom data into registers and GRF
if (lasty != y)
{
int j = y + tgx;
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].x = temp.x;
......@@ -355,7 +355,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Read fixed atom data into registers and GRF
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
......@@ -407,7 +407,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset];
of.x += af.x;
of.y += af.y;
......@@ -424,7 +424,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
......
......@@ -106,8 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, int);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, int);
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, unsigned int);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, unsigned int);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
......
......@@ -35,18 +35,18 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, int numWorkUnits)
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
int lasty = -1;
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
......@@ -61,7 +61,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
float4 af;
af.x = 0.0f;
......@@ -138,7 +138,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
else // bExclusion
{
unsigned int xi = x>>GRIDBITS;
int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
......@@ -206,7 +206,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
float4 of = cSim.pForce4a[offset];
of.x += af.x;
of.y += af.y;
......@@ -215,7 +215,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
cSim.pForce4a[offset] = of;
cSim.pBornForce[offset] = af.w;
#else
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
#endif
......@@ -225,7 +225,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
// Read fixed atom data into registers and GRF
if (lasty != y)
{
int j = y + tgx;
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
......@@ -255,7 +255,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
{
// Compute all interactions within this block.
for (int j = 0; j < GRID; j++)
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
......@@ -318,7 +318,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
{
// Compute only a subset of the interactions in this block.
for (int j = 0; j < GRID; j++)
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
......@@ -427,10 +427,10 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
{
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (int j = 0; j < GRID; j++)
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
......@@ -501,7 +501,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
float4 of = cSim.pForce4a[offset];
of.x += af.x;
of.y += af.y;
......@@ -518,7 +518,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
cSim.pForce4a[offset] = of;
cSim.pBornForce[offset] = af.w;
#else
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
af.x = sA[threadIdx.x].fx;
......
......@@ -35,7 +35,7 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit, int workUnits)
__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit, unsigned int workUnits)
{
extern __shared__ Atom sA[];
int end = workUnits / gridDim.x;
......@@ -61,7 +61,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
......@@ -121,17 +121,17 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
#else
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
#endif
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
......@@ -311,12 +311,12 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
......
......@@ -35,18 +35,18 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, int numWorkUnits)
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif
int lasty = -1;
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
......@@ -60,7 +60,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
......@@ -135,14 +135,14 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#else
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
......@@ -155,7 +155,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Read fixed atom data into registers and GRF
if (lasty != y)
{
int j = y + tgx;
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
......@@ -177,7 +177,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
{
// Compute all interactions within this block.
for (int j = 0; j < GRID; j++)
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
......@@ -273,7 +273,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
{
// Compute only a subset of the interactions in this block.
for (int j = 0; j < GRID; j++)
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
......@@ -404,7 +404,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride;
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset];
of.x += af.x;
of.y += af.y;
......@@ -417,7 +417,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
of.z += sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
#else
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
of.x = af.x;
of.y = af.y;
of.z = af.z;
......
......@@ -117,16 +117,16 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, int numWorkUnits)
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{
extern __shared__ unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int index = threadIdx.x & (GRID - 1);
int lasty = -1;
unsigned int lasty = 0xFFFFFFFF;
float4 apos;
while (pos < end)
{
......
......@@ -73,9 +73,9 @@ __device__ void kSyncAllThreads_kernel(unsigned int* syncCounter)
} while (counterValue > 0);
}
__device__ void kSolveMatrix_kernel(int numTerms, unsigned int* syncCounter)
__device__ void kSolveMatrix_kernel(unsigned int numTerms, unsigned int* syncCounter)
{
for (int iteration = 0; iteration < numTerms; iteration++) {
for (unsigned int iteration = 0; iteration < numTerms; iteration++) {
float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
......@@ -121,7 +121,7 @@ __device__ void kUpdateAtomPositions_kernel(float4* atomPositions)
}
}
__global__ void kApplyLincs_kernel(int numTerms, float4* atomPositions, bool addOldPosition)
__global__ void kApplyLincs_kernel(unsigned int numTerms, float4* atomPositions, bool addOldPosition)
{
// Calculate the direction of each constraint, along with the initial RHS and solution vectors.
......@@ -221,13 +221,6 @@ __global__ void kApplyLincs_kernel(int numTerms, float4* atomPositions, bool add
kUpdateAtomPositions_kernel(atomPositions);
}
void printDist(float4 v1, float4 v2)
{
float dx = v1.x-v2.x;
float dy = v1.y-v2.y;
float dz = v1.z-v2.z;
printf("%f ", sqrt(dx*dx+dy*dy+dz*dz));
}
void kApplyFirstLincs(gpuContext gpu)
{
// printf("kApplyFirstLincs\n");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment