Commit 01440b46 authored by Peter Eastman's avatar Peter Eastman
Browse files

Finished CUDA version of GayBerneForce

parent 222b3bb4
...@@ -6161,9 +6161,9 @@ void CudaCalcGayBerneForceKernel::initialize(const System& system, const GayBern ...@@ -6161,9 +6161,9 @@ void CudaCalcGayBerneForceKernel::initialize(const System& system, const GayBern
sortedPos = new CudaArray(cu, numRealParticles, 4*elementSize, "sortedPos"); sortedPos = new CudaArray(cu, numRealParticles, 4*elementSize, "sortedPos");
maxNeighborBlocks = numRealParticles*2; maxNeighborBlocks = numRealParticles*2;
neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors"); neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors");
neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighbors"); neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighborIndex");
neighborBlockCount = CudaArray::create<int>(cu, 1, "neighborBlockCount"); neighborBlockCount = CudaArray::create<int>(cu, 1, "neighborBlockCount");
if (force.getNonbondedMethod() != GayberneForce::NoCutoff) if (force.getNonbondedMethod() != GayBerneForce::NoCutoff)
CHECK_RESULT(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "Error creating event for CustomManyParticleForce"); CHECK_RESULT(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "Error creating event for CustomManyParticleForce");
// Create array for accumulating torques. // Create array for accumulating torques.
...@@ -6195,7 +6195,7 @@ void CudaCalcGayBerneForceKernel::initialize(const System& system, const GayBern ...@@ -6195,7 +6195,7 @@ void CudaCalcGayBerneForceKernel::initialize(const System& system, const GayBern
} }
} }
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudakernelSources::vectorOps+CudaKernelSources::gayBerne, defines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::gayBerne, defines);
framesKernel = cu.getKernel(module, "computeEllipsoidFrames"); framesKernel = cu.getKernel(module, "computeEllipsoidFrames");
blockBoundsKernel = cu.getKernel(module, "findBlockBounds"); blockBoundsKernel = cu.getKernel(module, "findBlockBounds");
neighborsKernel = cu.getKernel(module, "findNeighbors"); neighborsKernel = cu.getKernel(module, "findNeighbors");
...@@ -6245,7 +6245,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -6245,7 +6245,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo
neighborsArgs.push_back(&neighborBlockCount->getDevicePointer()); neighborsArgs.push_back(&neighborBlockCount->getDevicePointer());
neighborsArgs.push_back(&exclusions->getDevicePointer()); neighborsArgs.push_back(&exclusions->getDevicePointer());
neighborsArgs.push_back(&exclusionStartIndex->getDevicePointer()); neighborsArgs.push_back(&exclusionStartIndex->getDevicePointer());
forceArgs.push_back(&cu.getLongForceBuffer().getDevicePointer()); forceArgs.push_back(&cu.getForce().getDevicePointer());
forceArgs.push_back(&torque->getDevicePointer()); forceArgs.push_back(&torque->getDevicePointer());
forceArgs.push_back(&numRealParticles); forceArgs.push_back(&numRealParticles);
forceArgs.push_back(&numExceptions); forceArgs.push_back(&numExceptions);
...@@ -6272,7 +6272,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -6272,7 +6272,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo
forceArgs.push_back(cu.getPeriodicBoxVecYPointer()); forceArgs.push_back(cu.getPeriodicBoxVecYPointer());
forceArgs.push_back(cu.getPeriodicBoxVecZPointer()); forceArgs.push_back(cu.getPeriodicBoxVecZPointer());
} }
torqueArgs.push_back(&cu.getLongForceBuffer().getDevicePointer()); torqueArgs.push_back(&cu.getForce().getDevicePointer());
torqueArgs.push_back(&torque->getDevicePointer()); torqueArgs.push_back(&torque->getDevicePointer());
torqueArgs.push_back(&numRealParticles); torqueArgs.push_back(&numRealParticles);
torqueArgs.push_back(&cu.getPosq().getDevicePointer()); torqueArgs.push_back(&cu.getPosq().getDevicePointer());
...@@ -6289,6 +6289,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -6289,6 +6289,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo
cu.executeKernel(neighborsKernel, &neighborsArgs[0], numRealParticles); cu.executeKernel(neighborsKernel, &neighborsArgs[0], numRealParticles);
int* count = (int*) cu.getPinnedBuffer(); int* count = (int*) cu.getPinnedBuffer();
neighborBlockCount->download(count, false); neighborBlockCount->download(count, false);
CHECK_RESULT(cuEventRecord(event, 0), "Error recording event for GayBerneForce");
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNonbondedUtilities().getNumForceThreadBlocks()*cu.getNonbondedUtilities().getForceThreadBlockSize()); cu.executeKernel(forceKernel, &forceArgs[0], cu.getNonbondedUtilities().getNumForceThreadBlocks()*cu.getNonbondedUtilities().getForceThreadBlockSize());
CHECK_RESULT(cuEventSynchronize(event), "Error synchronizing on event for GayBerneForce"); CHECK_RESULT(cuEventSynchronize(event), "Error synchronizing on event for GayBerneForce");
if (*count <= maxNeighborBlocks) if (*count <= maxNeighborBlocks)
...@@ -6302,7 +6303,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -6302,7 +6303,7 @@ double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeFo
neighborIndex = NULL; neighborIndex = NULL;
maxNeighborBlocks = (int) ceil((*count)*1.1); maxNeighborBlocks = (int) ceil((*count)*1.1);
neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors"); neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors");
neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighbors"); neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighborIndex");
neighborsArgs[10] = &neighbors->getDevicePointer(); neighborsArgs[10] = &neighbors->getDevicePointer();
neighborsArgs[11] = &neighborIndex->getDevicePointer(); neighborsArgs[11] = &neighborIndex->getDevicePointer();
forceArgs[17] = &neighbors->getDevicePointer(); forceArgs[17] = &neighbors->getDevicePointer();
......
...@@ -49,7 +49,7 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* ...@@ -49,7 +49,7 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
a[2][1] = zdir.y; a[2][1] = zdir.y;
a[2][2] = zdir.z; a[2][2] = zdir.z;
float4 sig = sigParams[originalIndex]; float4 sig = sigParams[originalIndex];
float3 r2 = sig.yzw; float3 r2 = make_float3(sig.y, sig.z, sig.w);
float3 e2 = trimTo3(scale[originalIndex]); float3 e2 = trimTo3(scale[originalIndex]);
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) { for (int j = 0; j < 3; j++) {
...@@ -83,8 +83,8 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -83,8 +83,8 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
real4 center = 0.5f*(maxPos+minPos); real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif #endif
minPos = min(minPos, pos); minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = max(maxPos, pos); maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
} }
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
...@@ -99,7 +99,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -99,7 +99,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
/** /**
* This is called by findNeighbors() to write a block to the neighbor list. * This is called by findNeighbors() to write a block to the neighbor list.
*/ */
void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors, __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) { int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) {
int blockIndex = atomicAdd(neighborBlockCount, 1); int blockIndex = atomicAdd(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks) if (blockIndex >= maxNeighborBlocks)
...@@ -178,7 +178,7 @@ typedef struct { ...@@ -178,7 +178,7 @@ typedef struct {
real a[3][3], b[3][3], g[3][3]; real a[3][3], b[3][3], g[3][3];
} AtomData; } AtomData;
void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams, __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams,
const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) { const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) {
data->sig = sigParams[originalIndex]; data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex]; data->eps = epsParams[originalIndex];
...@@ -192,20 +192,19 @@ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real ...@@ -192,20 +192,19 @@ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real
} }
} }
real3 matrixVectorProduct(real (*m)[3], real3 v) { inline __device__ real3 matrixVectorProduct(real (*m)[3], real3 v) {
return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z, return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z, m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z); m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
} }
real3 vectorMatrixProduct(real3 v, real (*m)[3]) { inline __device__ real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z, return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z, m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z); m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
} }
inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0]; result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1]; result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2]; result[0][2] = a[0][2]+b[0][2];
...@@ -217,13 +216,12 @@ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) { ...@@ -217,13 +216,12 @@ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[2][2] = a[2][2]+b[2][2]; result[2][2] = a[2][2]+b[2][2];
} }
real determinant(real (*m)[3]) { inline __device__ real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] - return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]); m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
} }
inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m)); real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]); result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]); result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
...@@ -236,7 +234,7 @@ void matrixInverse(real (*result)[3], real (*m)[3]) { ...@@ -236,7 +234,7 @@ void matrixInverse(real (*result)[3], real (*m)[3]) {
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]); result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
} }
void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, real *totalEnergy) { __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, real *totalEnergy) {
real rInv = RSQRT(r2); real rInv = RSQRT(r2);
real r = r2*rInv; real r = r2*rInv;
real3 drUnit = dr*rInv; real3 drUnit = dr*rInv;
...@@ -325,7 +323,7 @@ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real ep ...@@ -325,7 +323,7 @@ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real ep
d[2][2] = scale.z*( a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) + d[2][2] = scale.z*( a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) + a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1])); 2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
real3 detadq = 0; real3 detadq = make_real3(0);
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
detadq += cross(make_real3(a[i][0], a[i][1], a[i][2]), make_real3(d[i][0], d[i][1], d[i][2])); detadq += cross(make_real3(a[i][0], a[i][1], a[i][2]), make_real3(d[i][0], d[i][1], d[i][2]));
real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue; real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
...@@ -338,7 +336,7 @@ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real ep ...@@ -338,7 +336,7 @@ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real ep
* Compute the interactions. * Compute the interactions.
*/ */
extern "C" __global__ void computeForce( extern "C" __global__ void computeForce(
long* __restrict__ forceBuffers, long* __restrict__ torqueBuffers, unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers,
int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos, int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos,
const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms, const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms,
const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix,
...@@ -362,8 +360,8 @@ extern "C" __global__ void computeForce( ...@@ -362,8 +360,8 @@ extern "C" __global__ void computeForce(
int index1 = sortedAtoms[atom1]; int index1 = sortedAtoms[atom1];
AtomData data1; AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix); loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f; real3 force1 = make_real3(0);
real3 torque1 = 0.0f; real3 torque1 = make_real3(0);
for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) { for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
// Load parameters for atom2. // Load parameters for atom2.
...@@ -373,8 +371,8 @@ extern "C" __global__ void computeForce( ...@@ -373,8 +371,8 @@ extern "C" __global__ void computeForce(
int index2 = sortedAtoms[atom2]; int index2 = sortedAtoms[atom2];
AtomData data2; AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix); loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f; real3 force2 = make_real3(0);
real3 torque2 = 0.0f; real3 torque2 = make_real3(0);
// Compute the interaction. // Compute the interaction.
...@@ -407,8 +405,8 @@ extern "C" __global__ void computeForce( ...@@ -407,8 +405,8 @@ extern "C" __global__ void computeForce(
int index1 = sortedAtoms[atom1]; int index1 = sortedAtoms[atom1];
AtomData data1; AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix); loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f; real3 force1 = make_real3(0);
real3 torque1 = 0.0f; real3 torque1 = make_real3(0);
int nextExclusion = exclusionStartIndex[atom1]; int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1]; int lastExclusion = exclusionStartIndex[atom1+1];
for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) { for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
...@@ -424,8 +422,8 @@ extern "C" __global__ void computeForce( ...@@ -424,8 +422,8 @@ extern "C" __global__ void computeForce(
int index2 = sortedAtoms[atom2]; int index2 = sortedAtoms[atom2];
AtomData data2; AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix); loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f; real3 force2 = make_real3(0);
real3 torque2 = 0.0f; real3 torque2 = make_real3(0);
// Compute the interaction. // Compute the interaction.
...@@ -460,8 +458,8 @@ extern "C" __global__ void computeForce( ...@@ -460,8 +458,8 @@ extern "C" __global__ void computeForce(
AtomData data1, data2; AtomData data1, data2;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix); loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix); loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0, force2 = 0; real3 force1 = make_real3(0), force2 = make_real3(0);
real3 torque1 = 0, torque2 = 0; real3 torque1 = make_real3(0), torque2 = make_real3(0);
real3 delta = data1.pos-data2.pos; real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -491,7 +489,7 @@ extern "C" __global__ void computeForce( ...@@ -491,7 +489,7 @@ extern "C" __global__ void computeForce(
* Convert the torques to forces on the connected particles. * Convert the torques to forces on the connected particles.
*/ */
extern "C" __global__ void applyTorques( extern "C" __global__ void applyTorques(
long* __restrict__ forceBuffers, long* __restrict__ torqueBuffers, unsigned long long* __restrict__ forceBuffers, long long* __restrict__ torqueBuffers,
int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices, int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
const int* sortedParticles) { const int* sortedParticles) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
...@@ -504,7 +502,7 @@ extern "C" __global__ void applyTorques( ...@@ -504,7 +502,7 @@ extern "C" __global__ void applyTorques(
real scale = 1/(real) 0x100000000; real scale = 1/(real) 0x100000000;
real3 torque = make_real3(scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]); real3 torque = make_real3(scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
real3 force = 0, xforce = 0, yforce = 0; real3 force = make_real3(0), xforce = make_real3(0), yforce = make_real3(0);
// Apply a force to the x particle. // Apply a force to the x particle.
......
...@@ -6384,7 +6384,7 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe ...@@ -6384,7 +6384,7 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe
sortedPos = new OpenCLArray(cl, numRealParticles, 4*elementSize, "sortedPos"); sortedPos = new OpenCLArray(cl, numRealParticles, 4*elementSize, "sortedPos");
maxNeighborBlocks = numRealParticles*2; maxNeighborBlocks = numRealParticles*2;
neighbors = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks*32, "neighbors"); neighbors = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks*32, "neighbors");
neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighbors"); neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighborIndex");
neighborBlockCount = OpenCLArray::create<cl_int>(cl, 1, "neighborBlockCount"); neighborBlockCount = OpenCLArray::create<cl_int>(cl, 1, "neighborBlockCount");
// Create array for accumulating torques. // Create array for accumulating torques.
...@@ -6514,7 +6514,7 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include ...@@ -6514,7 +6514,7 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include
neighborIndex = NULL; neighborIndex = NULL;
maxNeighborBlocks = (int) ceil((*count)*1.1); maxNeighborBlocks = (int) ceil((*count)*1.1);
neighbors = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks*32, "neighbors"); neighbors = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks*32, "neighbors");
neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighbors"); neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighborIndex");
neighborsKernel.setArg<cl::Buffer>(10, neighbors->getDeviceBuffer()); neighborsKernel.setArg<cl::Buffer>(10, neighbors->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(11, neighborIndex->getDeviceBuffer()); neighborsKernel.setArg<cl::Buffer>(11, neighborIndex->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(17, neighbors->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(17, neighbors->getDeviceBuffer());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment