2c2
<  * Perform the first step of Verlet integration.
---
>  * Perform the first step of verlet integration.
5,11c5,10
< extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
<         const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
<     const mixed2 stepSize = dt[0];
<     const mixed dtPos = stepSize.y;
<     const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
<     const mixed scale = dtVel/(mixed) 0x100000000;
<     for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
---
> __kernel void integrateVerletPart1(int numAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq, __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta) {
>     mixed2 stepSize = dt[0];
>     mixed dtPos = stepSize.y;
>     mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
>     int index = get_global_id(0);
>     while (index < numAtoms) {
17c16
<             mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
---
>             mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
21,26c20,23
<             velocity.x += scale*force[index]*velocity.w;
<             velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
<             velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
<             pos.x = velocity.x*dtPos;
<             pos.y = velocity.y*dtPos;
<             pos.z = velocity.z*dtPos;
---
>             velocity.x += force[index].x*dtVel*velocity.w;
>             velocity.y += force[index].y*dtVel*velocity.w;
>             velocity.z += force[index].z*dtVel*velocity.w;
>             pos.xyz = velocity.xyz*dtPos;
29a27
>         index += get_global_size(0);
34c32
<  * Perform the second step of Verlet integration.
---
>  * Perform the second step of verlet integration.
37,38c35
< extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
<         real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
---
> __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) {
40c37
< #if __CUDA_ARCH__ >= 130
---
> #ifdef SUPPORTS_DOUBLE_PRECISION
46,47c43
<     int index = blockIdx.x*blockDim.x+threadIdx.x;
<     if (index == 0)
---
>     if (get_global_id(0) == 0)
49c45,47
<     for (; index < numAtoms; index += blockDim.x*gridDim.x) {
---
>     barrier(CLK_LOCAL_MEM_FENCE);
>     int index = get_global_id(0);
>     while (index < numAtoms) {
55c53
<             mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
---
>             mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
60,64c58,60
<             pos.x += delta.x;
<             pos.y += delta.y;
<             pos.z += delta.z;
< #if __CUDA_ARCH__ >= 130
<             velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
---
>             pos.xyz += delta.xyz;
> #ifdef SUPPORTS_DOUBLE_PRECISION
>             velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
66c62
<             velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
---
>             velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
69,70c65,66
<             posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
<             posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
---
>             posq[index] = convert_real4(pos);
>             posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
75a72
>         index += get_global_size(0);
83c80
< extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
---
> __kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorTol, __global mixed2* restrict dt, __global const mixed4* restrict velm, __global const real4* restrict force, __local mixed* restrict error) {
86,90c83,86
<     extern __shared__ mixed error[];
<     mixed err = 0.0f;
<     const mixed scale = RECIP((mixed) 0x100000000);
<     for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
<         mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
---
>     mixed err = 0;
>     int index = get_local_id(0);
>     while (index < numAtoms) {
>         real4 f = force[index];
92a89
>         index += get_global_size(0);
94,95c91,92
<     error[threadIdx.x] = err;
<     __syncthreads();
---
>     error[get_local_id(0)] = err;
>     barrier(CLK_LOCAL_MEM_FENCE);
99,102c96,99
<     for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) {
<         if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
<             error[threadIdx.x] += error[threadIdx.x+offset];
<         __syncthreads();
---
>     for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
>         if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
>             error[get_local_id(0)] += error[get_local_id(0)+offset];
>         barrier(CLK_LOCAL_MEM_FENCE);
104,106c101,103
<     if (threadIdx.x == 0) {
<         mixed totalError = SQRT(error[0]/(numAtoms*3));
<         mixed newStepSize = SQRT(errorTol/totalError);
---
>     if (get_local_id(0) == 0) {
>         mixed totalError = sqrt(error[0]/(numAtoms*3));
>         mixed newStepSize = sqrt(errorTol/totalError);