/**
 * Sum a value over all threads.
 */
DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) {
    const int thread = LOCAL_ID;
    SYNC_THREADS;
    temp[thread] = value;
    SYNC_THREADS;
    for (int step = 1; step < 32; step *= 2) {
        if (thread+step < LOCAL_SIZE && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
        SYNC_WARPS;
    }
    for (int step = 32; step < LOCAL_SIZE; step *= 2) {
        if (thread+step < LOCAL_SIZE && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
        SYNC_THREADS;
    }
    return temp[0];
}

/**
 * First step of computing the radius of gyration: each thread block computes a contribution
 * to the center position.
 */
KERNEL void computeCenterPosition(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL const int* RESTRICT particles,
        GLOBAL real* centerBuffer) {
    LOCAL volatile real temp[THREAD_BLOCK_SIZE];
    real3 center = make_real3(0);
    for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE)
        center += trimTo3(posq[particles[i]]);
    center.x = reduceValue(center.x, temp);
    center.y = reduceValue(center.y, temp);
    center.z = reduceValue(center.z, temp);
    if (LOCAL_ID == 0) {
        centerBuffer[3*GROUP_ID] = center.x;
        centerBuffer[3*GROUP_ID+1] = center.y;
        centerBuffer[3*GROUP_ID+2] = center.z;
    }
}

/**
 * Second step of computing the radius of gyration: each thread block computes a contribution to Rg^2.
 */
KERNEL void computeRg(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL const int* RESTRICT particles,
        GLOBAL real* centerBuffer, GLOBAL real* rgBuffer) {
    LOCAL volatile real temp[THREAD_BLOCK_SIZE];

    // Sum the contributions computed in the previous kernel to find the center position.

    real3 center = make_real3(0);
    for (int i = LOCAL_ID; i < NUM_GROUPS; i += LOCAL_SIZE) {
        center.x += centerBuffer[3*i];
        center.y += centerBuffer[3*i+1];
        center.z += centerBuffer[3*i+2];
    }
    center.x = reduceValue(center.x, temp)/numParticles;
    center.y = reduceValue(center.y, temp)/numParticles;
    center.z = reduceValue(center.z, temp)/numParticles;
    if (GLOBAL_ID == 0) {
        // Save the result so we don't need to compute it again in the next kernel.

        centerBuffer[3*NUM_GROUPS] = center.x;
        centerBuffer[3*NUM_GROUPS+1] = center.y;
        centerBuffer[3*NUM_GROUPS+2] = center.z;
    }

    // Compute the contributions to Rg^2.

    real rg2 = 0;
    for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
        real3 delta = trimTo3(posq[particles[i]]) - center;
        rg2 += dot(delta, delta);
    }
    rg2 = reduceValue(rg2, temp);
    if (LOCAL_ID == 0)
        rgBuffer[GROUP_ID] = rg2;
}

/**
 * Third step of computing the radius of gyration: compute the forces.
 */
KERNEL void computeForces(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL const int* RESTRICT particles,
        GLOBAL const real* centerBuffer, GLOBAL real* rgBuffer, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer) {
    LOCAL volatile real temp[THREAD_BLOCK_SIZE];
    real3 center = make_real3(centerBuffer[3*NUM_GROUPS], centerBuffer[3*NUM_GROUPS+1], centerBuffer[3*NUM_GROUPS+2]);

    // Sum the contributions computed in the previous kernel to find Rg.

    real rg2 = 0;
    for (int i = LOCAL_ID; i < NUM_GROUPS; i += LOCAL_SIZE)
        rg2 += rgBuffer[i];
    real rg = SQRT(reduceValue(rg2, temp)/numParticles);
    if (GLOBAL_ID == 0)
        energyBuffer[0] += rg;

    // Compute the forces.

    real scale = 1/(rg*numParticles);
    for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
        int particle = particles[i];
        real3 delta = trimTo3(posq[particle]) - center;
        ATOMIC_ADD(&forceBuffers[particle], (mm_ulong) realToFixedPoint(-scale*delta.x));
        ATOMIC_ADD(&forceBuffers[particle+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(-scale*delta.y));
        ATOMIC_ADD(&forceBuffers[particle+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(-scale*delta.z));
    }
}