/**
 * Perform the first step of Verlet integration.
 */

extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, const real4* __restrict__ posq, real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta) {
    const real2 stepSize = dt[0];
    const real dtPos = stepSize.y;
    const real dtVel = 0.5f*(stepSize.x+stepSize.y);
    const real scale = dtVel/(real) 0xFFFFFFFF;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        real4 velocity = velm[index];
        if (velocity.w != 0.0) {
            real4 pos = posq[index];
            velocity.x += scale*force[index]*velocity.w;
            velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
            velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
            pos.x = velocity.x*dtPos;
            pos.y = velocity.y*dtPos;
            pos.z = velocity.z*dtPos;
            posDelta[index] = pos;
            velm[index] = velocity;
        }
    }
}

/**
 * Perform the second step of Verlet integration.
 */

extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* __restrict__ posq, real4* __restrict__ velm, const real4* __restrict__ posDelta) {
    real2 stepSize = dt[0];
    double oneOverDt = 1.0/stepSize.y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index == 0)
        dt[0].x = stepSize.y;
    for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        real4 velocity = velm[index];
        if (velocity.w != 0.0) {
            real4 pos = posq[index];
            real4 delta = posDelta[index];
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
            velocity = make_real4((real) (delta.x*oneOverDt), (real) (delta.y*oneOverDt), (real) (delta.z*oneOverDt), velocity.w);
            posq[index] = pos;
            velm[index] = velocity;
        }
    }
}

/**
 * Select the step size to use for the next step.
 */
//
//extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const real4* __restrict__ force, __local real* __restrict__ error) {
//    // Calculate the error.
//
//    real err = 0.0f;
//    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
//        real4 f = force[index];
//        real invMass = velm[index].w;
//        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
//    }
//    error[threadIdx.x] = err;
//    __syncthreads;
//
//    // Sum the errors from all threads.
//
//    for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
//        if (threadIdx.x+offset < get_local_size(0) && (threadIdx.x&(2*offset-1)) == 0)
//            error[threadIdx.x] += error[threadIdx.x+offset];
//        __syncthreads;
//    }
//    if (threadIdx.x == 0) {
//        real totalError = sqrt(error[0]/(NUM_ATOMS*3));
//        real newStepSize = sqrt(errorTol/totalError);
//        real oldStepSize = dt[0].y;
//        if (oldStepSize > 0.0f)
//            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
//        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
//            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
//        if (newStepSize > maxStepSize)
//            newStepSize = maxStepSize;
//        dt[0].y = newStepSize;
//    }
//}