Commit 5b017677 authored by Peter Eastman's avatar Peter Eastman
Browse files

Converted lots of ints to unsigned ints (might help performance a little by avoiding conversions)

parent e85347d1
...@@ -210,8 +210,8 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave) ...@@ -210,8 +210,8 @@ void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
delete[] pTemp; delete[] pTemp;
} }
static const int GRID = 32; static const unsigned int GRID = 32;
static const int GRIDBITS = 5; static const unsigned int GRIDBITS = 5;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256; static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320; static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256; static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
......
...@@ -35,18 +35,18 @@ ...@@ -35,18 +35,18 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, int numWorkUnits) __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block]; float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
int lasty = -1; unsigned int lasty = 0xFFFFFFFF;
while (pos < end) while (pos < end)
{ {
...@@ -69,7 +69,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -69,7 +69,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
float dEdR; float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; Atom* psA = &sA[tbx];
unsigned int i = x + tgx; unsigned int i = x + tgx;
apos = cSim.pPosq[i]; apos = cSim.pPosq[i];
...@@ -130,7 +130,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -130,7 +130,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
else // bExclusion else // bExclusion
{ {
unsigned int xi = x>>GRIDBITS; unsigned int xi = x>>GRIDBITS;
int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx]; unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
...@@ -177,7 +177,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -177,7 +177,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Write results // Write results
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset]; of = cSim.pForce4a[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
...@@ -188,7 +188,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -188,7 +188,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
of.y = af.y; of.y = af.y;
of.z = af.z; of.z = af.z;
of.w = 0.0f; of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of; cSim.pForce4a[offset] = of;
#endif #endif
} }
...@@ -197,7 +197,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -197,7 +197,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
if (lasty != y) if (lasty != y)
{ {
int j = y + tgx; unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j]; float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j]; float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].x = temp.x; sA[threadIdx.x].x = temp.x;
...@@ -355,7 +355,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -355,7 +355,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
unsigned int xi = x>>GRIDBITS; unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS; unsigned int yi = y>>GRIDBITS;
int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2; unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx]; unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx)); excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
...@@ -407,7 +407,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -407,7 +407,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
// Write results // Write results
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4a[offset]; of = cSim.pForce4a[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
...@@ -424,7 +424,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -424,7 +424,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
of.y = af.y; of.y = af.y;
of.z = af.z; of.z = af.z;
of.w = 0.0f; of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of; cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx; of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy; of.y = sA[threadIdx.x].fy;
......
...@@ -106,8 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel(); ...@@ -106,8 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel(); extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel(); extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel(); extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, int); extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, unsigned int);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, int); extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, unsigned int);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu) void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
......
...@@ -35,18 +35,18 @@ ...@@ -35,18 +35,18 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, int numWorkUnits) __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
int lasty = -1; unsigned int lasty = -0xFFFFFFFF;
while (pos < end) while (pos < end)
{ {
...@@ -61,7 +61,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -61,7 +61,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float2 a = cSim.pAttr[i]; float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i]; float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; Atom* psA = &sA[tbx];
float4 af; float4 af;
af.x = 0.0f; af.x = 0.0f;
...@@ -138,7 +138,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -138,7 +138,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
else // bExclusion else // bExclusion
{ {
unsigned int xi = x>>GRIDBITS; unsigned int xi = x>>GRIDBITS;
int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx]; unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
...@@ -206,7 +206,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -206,7 +206,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
float4 of = cSim.pForce4a[offset]; float4 of = cSim.pForce4a[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
...@@ -215,7 +215,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -215,7 +215,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
cSim.pForce4a[offset] = of; cSim.pForce4a[offset] = of;
cSim.pBornForce[offset] = af.w; cSim.pBornForce[offset] = af.w;
#else #else
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af; cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w; cSim.pBornForce[offset] = af.w;
#endif #endif
...@@ -225,7 +225,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -225,7 +225,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
if (lasty != y) if (lasty != y)
{ {
int j = y + tgx; unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j]; float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j]; float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j]; sA[threadIdx.x].br = cSim.pBornRadii[j];
...@@ -255,7 +255,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -255,7 +255,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
{ {
// Compute all interactions within this block. // Compute all interactions within this block.
for (int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
float dx = psA[tj].x - apos.x; float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y; float dy = psA[tj].y - apos.y;
...@@ -318,7 +318,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -318,7 +318,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
{ {
// Compute only a subset of the interactions in this block. // Compute only a subset of the interactions in this block.
for (int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
if ((flags&(1<<j)) != 0) if ((flags&(1<<j)) != 0)
{ {
...@@ -427,10 +427,10 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -427,10 +427,10 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
{ {
unsigned int xi = x>>GRIDBITS; unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS; unsigned int yi = y>>GRIDBITS;
int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2; unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx]; unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx)); excl = (excl >> tgx) | (excl << (GRID - tgx));
for (int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
float dx = psA[tj].x - apos.x; float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y; float dy = psA[tj].y - apos.y;
...@@ -501,7 +501,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -501,7 +501,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
float4 of = cSim.pForce4a[offset]; float4 of = cSim.pForce4a[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
...@@ -518,7 +518,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -518,7 +518,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
cSim.pForce4a[offset] = of; cSim.pForce4a[offset] = of;
cSim.pBornForce[offset] = af.w; cSim.pBornForce[offset] = af.w;
#else #else
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af; cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w; cSim.pBornForce[offset] = af.w;
af.x = sA[threadIdx.x].fx; af.x = sA[threadIdx.x].fx;
......
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit, int workUnits) __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit, unsigned int workUnits)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
int end = workUnits / gridDim.x; int end = workUnits / gridDim.x;
...@@ -61,7 +61,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor ...@@ -61,7 +61,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
...@@ -121,17 +121,17 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor ...@@ -121,17 +121,17 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w; cSim.pBornSum[offset] += apos.w;
#else #else
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w; cSim.pBornSum[offset] = apos.w;
#endif #endif
} }
else // 100% utilization else // 100% utilization
{ {
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
int j = y + tgx; unsigned int j = y + tgx;
unsigned int i = x + tgx; unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j]; float4 temp = cSim.pPosq[j];
...@@ -311,12 +311,12 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor ...@@ -311,12 +311,12 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w; cSim.pBornSum[offset] += apos.w;
offset = y + tgx + warp*cSim.stride; offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].sum; cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else #else
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w; cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride; offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum; cSim.pBornSum[offset] = sA[threadIdx.x].sum;
......
...@@ -35,18 +35,18 @@ ...@@ -35,18 +35,18 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, int numWorkUnits) __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block]; float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif #endif
int lasty = -1; unsigned int lasty = -0xFFFFFFFF;
while (pos < end) while (pos < end)
{ {
...@@ -60,7 +60,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -60,7 +60,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
float2 a = cSim.pObcData[i]; float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i]; float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; Atom* psA = &sA[tbx];
float3 af; float3 af;
sA[threadIdx.x].fx = af.x = 0.0f; sA[threadIdx.x].fx = af.x = 0.0f;
...@@ -135,14 +135,14 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -135,14 +135,14 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Write results // Write results
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset]; of = cSim.pForce4b[offset];
of.x += af.x + sA[threadIdx.x].fx; of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy; of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz; of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of; cSim.pForce4b[offset] = of;
#else #else
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
of.x = af.x + sA[threadIdx.x].fx; of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy; of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz; of.z = af.z + sA[threadIdx.x].fz;
...@@ -155,7 +155,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -155,7 +155,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
if (lasty != y) if (lasty != y)
{ {
int j = y + tgx; unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j]; float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j]; float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j]; sA[threadIdx.x].fb = cSim.pBornForce[j];
...@@ -177,7 +177,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -177,7 +177,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
{ {
// Compute all interactions within this block. // Compute all interactions within this block.
for (int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
float dx = psA[tj].x - apos.x; float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y; float dy = psA[tj].y - apos.y;
...@@ -273,7 +273,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -273,7 +273,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
{ {
// Compute only a subset of the interactions in this block. // Compute only a subset of the interactions in this block.
for (int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
if ((flags&(1<<j)) != 0) if ((flags&(1<<j)) != 0)
{ {
...@@ -404,7 +404,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -404,7 +404,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
// Write results // Write results
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
int offset = x + tgx + warp*cSim.stride; unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4b[offset]; of = cSim.pForce4b[offset];
of.x += af.x; of.x += af.x;
of.y += af.y; of.y += af.y;
...@@ -417,7 +417,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor ...@@ -417,7 +417,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
of.z += sA[threadIdx.x].fz; of.z += sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of; cSim.pForce4b[offset] = of;
#else #else
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
of.x = af.x; of.x = af.x;
of.y = af.y; of.y = af.y;
of.z = af.z; of.z = af.z;
......
...@@ -117,16 +117,16 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)() ...@@ -117,16 +117,16 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
* Compare each atom in one block to the bounding box of another block, and set * Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting. * flags for which ones are interacting.
*/ */
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, int numWorkUnits) __global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{ {
extern __shared__ unsigned int flags[]; extern __shared__ unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int index = threadIdx.x & (GRID - 1); unsigned int index = threadIdx.x & (GRID - 1);
int lasty = -1; unsigned int lasty = 0xFFFFFFFF;
float4 apos; float4 apos;
while (pos < end) while (pos < end)
{ {
......
...@@ -73,9 +73,9 @@ __device__ void kSyncAllThreads_kernel(unsigned int* syncCounter) ...@@ -73,9 +73,9 @@ __device__ void kSyncAllThreads_kernel(unsigned int* syncCounter)
} while (counterValue > 0); } while (counterValue > 0);
} }
__device__ void kSolveMatrix_kernel(int numTerms, unsigned int* syncCounter) __device__ void kSolveMatrix_kernel(unsigned int numTerms, unsigned int* syncCounter)
{ {
for (int iteration = 0; iteration < numTerms; iteration++) { for (unsigned int iteration = 0; iteration < numTerms; iteration++) {
float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2); float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1); float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -121,7 +121,7 @@ __device__ void kUpdateAtomPositions_kernel(float4* atomPositions) ...@@ -121,7 +121,7 @@ __device__ void kUpdateAtomPositions_kernel(float4* atomPositions)
} }
} }
__global__ void kApplyLincs_kernel(int numTerms, float4* atomPositions, bool addOldPosition) __global__ void kApplyLincs_kernel(unsigned int numTerms, float4* atomPositions, bool addOldPosition)
{ {
// Calculate the direction of each constraint, along with the initial RHS and solution vectors. // Calculate the direction of each constraint, along with the initial RHS and solution vectors.
...@@ -221,13 +221,6 @@ __global__ void kApplyLincs_kernel(int numTerms, float4* atomPositions, bool add ...@@ -221,13 +221,6 @@ __global__ void kApplyLincs_kernel(int numTerms, float4* atomPositions, bool add
kUpdateAtomPositions_kernel(atomPositions); kUpdateAtomPositions_kernel(atomPositions);
} }
void printDist(float4 v1, float4 v2)
{
float dx = v1.x-v2.x;
float dy = v1.y-v2.y;
float dz = v1.z-v2.z;
printf("%f ", sqrt(dx*dx+dy*dy+dz*dz));
}
void kApplyFirstLincs(gpuContext gpu) void kApplyFirstLincs(gpuContext gpu)
{ {
// printf("kApplyFirstLincs\n"); // printf("kApplyFirstLincs\n");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment