Commit 734d5d67 authored by Scott Le Grand's avatar Scott Le Grand
Browse files

No commit message

No commit message
parent a409f0e8
......@@ -222,12 +222,12 @@ static const int GF1XX_BLOCKS_PER_SM = 3;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 256;
static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
//static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
......@@ -252,7 +252,7 @@ static const int GF1XX_RANDOM_THREADS_PER_BLOCK = 768;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 256;
static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 768;
static const unsigned int MAX_STACK_SIZE = 8;
static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
......
......@@ -51,7 +51,15 @@ void GetBrownianUpdateSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kBrownianUpdatePart1_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kBrownianUpdatePart1_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
......@@ -82,7 +90,15 @@ void kBrownianUpdatePart1(gpuContext gpu)
LAUNCHERROR("kBrownianUpdatePart1");
}
__global__ void kBrownianUpdatePart2_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kBrownianUpdatePart2_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
......
......@@ -33,7 +33,15 @@
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
......
......@@ -33,7 +33,15 @@
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
......
......@@ -118,7 +118,15 @@ void SetCustomTorsionGlobalParams(const vector<float>& paramValues)
angle = (dp >= 0) ? angle : -angle; \
}
__global__ void kCalculateCustomTorsionForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateCustomTorsionForces_kernel()
{
extern __shared__ float stack[];
float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
......
......@@ -37,7 +37,15 @@
#include "kCalculateGBVIAux.h"
__global__ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
......
......@@ -37,7 +37,15 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
......
......@@ -123,7 +123,15 @@ void GetCalculateLocalForcesSim(gpuContext gpu)
}
__global__ void kCalculateLocalForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateLocalForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
Vectors* A = &sV[threadIdx.x];
......
......@@ -97,7 +97,9 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
#include "kCalculateObcGbsaBornSum.h"
__global__ void kClearObcGbsaBornSum_kernel()
__global__
__launch_bounds__(384, 1)
void kClearObcGbsaBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
......@@ -107,7 +109,9 @@ __global__ void kClearObcGbsaBornSum_kernel()
}
}
__global__ void kReduceObcGbsaBornSum_kernel()
__global__
__launch_bounds__(384, 1)
void kReduceObcGbsaBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......
......@@ -30,7 +30,15 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
/*
......
......@@ -30,7 +30,15 @@
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
......
......@@ -53,7 +53,9 @@ void GetForcesSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed");
}
__global__ void kClearForces_kernel()
__global__
__launch_bounds__(384, 1)
void kClearForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.outputBuffers)
......@@ -70,7 +72,9 @@ void kClearForces(gpuContext gpu)
LAUNCHERROR("kClearForces");
}
__global__ void kClearBornForces_kernel()
__global__
__launch_bounds__(384, 1)
void kClearBornForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
......@@ -87,7 +91,9 @@ void kClearBornForces(gpuContext gpu)
LAUNCHERROR("kClearBornForces");
}
__global__ void kClearEnergy_kernel()
__global__
__launch_bounds__(384, 1)
void kClearEnergy_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.energyOutputBuffers)
......@@ -104,7 +110,15 @@ void kClearEnergy(gpuContext gpu)
LAUNCHERROR("kClearEnergy");
}
__global__ void kReduceBornSumAndForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceBornSumAndForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......@@ -207,7 +221,15 @@ void kReduceBornSumAndForces(gpuContext gpu)
LAUNCHERROR("kReduceBornSumAndForces");
}
__global__ void kReduceForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......@@ -269,7 +291,15 @@ double kReduceEnergy(gpuContext gpu)
return sum;
}
__global__ void kReduceObcGbsaBornForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kReduceObcGbsaBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
......@@ -327,7 +357,15 @@ __global__ void kReduceObcGbsaBornForces_kernel()
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
}
__global__ void kReduceGBVIBornForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
......
......@@ -101,7 +101,15 @@ void kLangevinUpdatePart2(gpuContext gpu)
}
__global__ void kSelectLangevinStepSize_kernel(float maxStepSize)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSelectLangevinStepSize_kernel(float maxStepSize)
{
// Calculate the error.
......
......@@ -31,13 +31,21 @@
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
__global__ void kLangevinUpdatePart1CM_kernel()
void kLangevinUpdatePart1CM_kernel()
#else
__global__ void kLangevinUpdatePart1_kernel()
void kLangevinUpdatePart1_kernel()
#endif
{
__shared__ float params[MaxParams];
__shared__ volatile float params[MaxParams];
if (threadIdx.x < MaxParams)
params[threadIdx.x] = cSim.pLangevinParameters[threadIdx.x];
__syncthreads();
......@@ -127,10 +135,18 @@ __global__ void kLangevinUpdatePart1_kernel()
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
__global__ void kLangevinUpdatePart2CM_kernel()
void kLangevinUpdatePart2CM_kernel()
#else
__global__ void kLangevinUpdatePart2_kernel()
void kLangevinUpdatePart2_kernel()
#endif
{
__shared__ float params[MaxParams];
......
......@@ -54,7 +54,15 @@ void GetRandomSim(gpuContext gpu)
extern __shared__ float3 sRand[];
__global__ void kGenerateRandoms_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_RANDOM_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_RANDOM_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_RANDOM_THREADS_PER_BLOCK, 1)
#endif
void kGenerateRandoms_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int increment = blockDim.x * gridDim.x;
......
......@@ -57,7 +57,15 @@ void GetSettleSim(gpuContext gpu)
* S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992).
*/
__global__ void kApplyFirstSettle_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplyFirstSettle_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints)
......@@ -231,7 +239,15 @@ void kApplyFirstSettle(gpuContext gpu)
}
}
__global__ void kApplySecondSettle_kernel()
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplySecondSettle_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints)
......
......@@ -64,9 +64,23 @@ void GetShakeHSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kApplyFirstShake_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplyFirstShake_kernel()
{
__shared__ Atom sA[G8X_THREADS_PER_BLOCK];
#if (__CUDA_ARCH__ >= 200)
__shared__ Atom sA[GF1XX_SHAKE_THREADS_PER_BLOCK];
#elif (__CUDA_ARCH__ >= 130)
__shared__ Atom sA[GT2XX_SHAKE_THREADS_PER_BLOCK];
#else
__shared__ Atom sA[G8X_SHAKE_THREADS_PER_BLOCK];
#endif
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
......@@ -226,9 +240,23 @@ void kApplyFirstShake(gpuContext gpu)
}
}
__global__ void kApplySecondShake_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplySecondShake_kernel()
{
__shared__ Atom sA[G8X_THREADS_PER_BLOCK];
#if (__CUDA_ARCH__ >= 200)
__shared__ Atom sA[GF1XX_SHAKE_THREADS_PER_BLOCK];
#elif (__CUDA_ARCH__ >= 130)
__shared__ Atom sA[GT2XX_SHAKE_THREADS_PER_BLOCK];
#else
__shared__ Atom sA[G8X_SHAKE_THREADS_PER_BLOCK];
#endif
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
......@@ -394,7 +422,15 @@ __global__ void kApplySecondShake_kernel()
}
}
__global__ void kApplyNoShake_kernel()
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplyNoShake_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints)
......
......@@ -89,7 +89,15 @@ void kVerletUpdatePart2(gpuContext gpu)
}
}
__global__ void kSelectVerletStepSize_kernel(float maxStepSize)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSelectVerletStepSize_kernel(float maxStepSize)
{
// Calculate the error.
......
......@@ -29,16 +29,24 @@
* several times in kVerletUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
__global__ void kVerletUpdatePart1CM_kernel()
void kVerletUpdatePart1CM_kernel()
#else
__global__ void kVerletUpdatePart1_kernel()
void kVerletUpdatePart1_kernel()
#endif
{
// Load the step size to take.
__shared__ float dtPos;
__shared__ float dtVel;
__shared__ volatile float dtPos;
__shared__ volatile float dtVel;
if (threadIdx.x == 0)
{
float2 stepSize = cSim.pStepSize[0];
......@@ -111,10 +119,18 @@ __global__ void kVerletUpdatePart1_kernel()
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
__global__ void kVerletUpdatePart2CM_kernel()
void kVerletUpdatePart2CM_kernel()
#else
__global__ void kVerletUpdatePart2_kernel()
void kVerletUpdatePart2_kernel()
#endif
{
// Load the step size to take.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment