2c2 < * Perform the first step of Verlet integration. --- > * Perform the first step of verlet integration. 5,11c5,10 < extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq, < const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) { < const mixed2 stepSize = dt[0]; < const mixed dtPos = stepSize.y; < const mixed dtVel = 0.5f*(stepSize.x+stepSize.y); < const mixed scale = dtVel/(mixed) 0x100000000; < for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { --- > __kernel void integrateVerletPart1(int numAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq, __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta) { > mixed2 stepSize = dt[0]; > mixed dtPos = stepSize.y; > mixed dtVel = 0.5f*(stepSize.x+stepSize.y); > int index = get_global_id(0); > while (index < numAtoms) { 17c16 < mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); --- > mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); 21,26c20,23 < velocity.x += scale*force[index]*velocity.w; < velocity.y += scale*force[index+paddedNumAtoms]*velocity.w; < velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w; < pos.x = velocity.x*dtPos; < pos.y = velocity.y*dtPos; < pos.z = velocity.z*dtPos; --- > velocity.x += force[index].x*dtVel*velocity.w; > velocity.y += force[index].y*dtVel*velocity.w; > velocity.z += force[index].z*dtVel*velocity.w; > pos.xyz = velocity.xyz*dtPos; 29a27 > index += get_global_size(0); 34c32 < * Perform the second step of Verlet integration. --- > * Perform the second step of verlet integration. 37,38c35 < extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq, < real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) { --- > __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) { 40c37 < #if __CUDA_ARCH__ >= 130 --- > #ifdef SUPPORTS_DOUBLE_PRECISION 46,47c43 < int index = blockIdx.x*blockDim.x+threadIdx.x; < if (index == 0) --- > if (get_global_id(0) == 0) 49c45,47 < for (; index < numAtoms; index += blockDim.x*gridDim.x) { --- > barrier(CLK_LOCAL_MEM_FENCE); > int index = get_global_id(0); > while (index < numAtoms) { 55c53 < mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); --- > mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); 60,64c58,60 < pos.x += delta.x; < pos.y += delta.y; < pos.z += delta.z; < #if __CUDA_ARCH__ >= 130 < velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w); --- > pos.xyz += delta.xyz; > #ifdef SUPPORTS_DOUBLE_PRECISION > velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz; 66c62 < velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w); --- > velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction; 69,70c65,66 < posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); < posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); --- > posq[index] = convert_real4(pos); > posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); 75a72 > index += get_global_size(0); 83c80 < extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) { --- > __kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorTol, __global mixed2* restrict dt, __global const mixed4* restrict velm, __global const real4* restrict force, __local mixed* restrict error) { 86,90c83,86 < extern __shared__ mixed error[]; < mixed err = 0.0f; < const mixed scale = RECIP((mixed) 0x100000000); < for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { < mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]); --- > mixed err = 0; > int index = get_local_id(0); > while (index < numAtoms) { > real4 f = force[index]; 92a89 > index += get_global_size(0); 94,95c91,92 < error[threadIdx.x] = err; < __syncthreads(); --- > error[get_local_id(0)] = err; > barrier(CLK_LOCAL_MEM_FENCE); 99,102c96,99 < for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) { < if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0) < error[threadIdx.x] += error[threadIdx.x+offset]; < __syncthreads(); --- > for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) { > if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0) > error[get_local_id(0)] += error[get_local_id(0)+offset]; > barrier(CLK_LOCAL_MEM_FENCE); 104,106c101,103 < if (threadIdx.x == 0) { < mixed totalError = SQRT(error[0]/(numAtoms*3)); < mixed newStepSize = SQRT(errorTol/totalError); --- > if (get_local_id(0) == 0) { > mixed totalError = sqrt(error[0]/(numAtoms*3)); > mixed newStepSize = sqrt(errorTol/totalError);