/**
 * Calculate the center of mass momentum.
 */

extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const real4* __restrict__ velm, float4* __restrict__ cmMomentum) {
    extern __shared__ volatile float3 temp[];
    float3 cm = make_float3(0, 0, 0);
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        real4 velocity = velm[index];
        if (velocity.w != 0.0) {
            real mass = RECIP(velocity.w);
            cm.x += (float) velocity.x*mass;
            cm.y += (float) velocity.y*mass;
            cm.z += (float) velocity.z*mass;
        }
    }

    // Sum the threads in this group.

    int thread = threadIdx.x;
    temp[thread].x = cm.x;
    temp[thread].y = cm.y;
    temp[thread].z = cm.z;
    __syncthreads();
    if (thread < 32) {
        temp[thread].x += temp[thread+32].x;
        temp[thread].y += temp[thread+32].y;
        temp[thread].z += temp[thread+32].z;
        if (thread < 16) {
            temp[thread].x += temp[thread+16].x;
            temp[thread].y += temp[thread+16].y;
            temp[thread].z += temp[thread+16].z;
        }
        if (thread < 8) {
            temp[thread].x += temp[thread+8].x;
            temp[thread].y += temp[thread+8].y;
            temp[thread].z += temp[thread+8].z;
        }
        if (thread < 4) {
            temp[thread].x += temp[thread+4].x;
            temp[thread].y += temp[thread+4].y;
            temp[thread].z += temp[thread+4].z;
        }
        if (thread < 2) {
            temp[thread].x += temp[thread+2].x;
            temp[thread].y += temp[thread+2].y;
            temp[thread].z += temp[thread+2].z;
        }
    }
    if (thread == 0) {
        float3 sum = make_float3(temp[thread].x+temp[thread+1].x, temp[thread].y+temp[thread+1].y, temp[thread].z+temp[thread+1].z);
        cmMomentum[blockIdx.x] = make_float4(sum.x, sum.y, sum.z, 0.0f);
    }
}

/**
 * Remove center of mass motion.
 */

extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, real4* __restrict__ velm, const float4* __restrict__ cmMomentum) {
    // First sum all of the momenta that were calculated by individual groups.

    extern volatile float3 temp[];
    float3 cm = make_float3(0, 0, 0);
    for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
        float4 momentum = cmMomentum[index];
        cm.x += momentum.x;
        cm.y += momentum.y;
        cm.z += momentum.z;
    }
    int thread = threadIdx.x;
    temp[thread].x = cm.x;
    temp[thread].y = cm.y;
    temp[thread].z = cm.z;
    __syncthreads();
    if (thread < 32) {
        temp[thread].x += temp[thread+32].x;
        temp[thread].y += temp[thread+32].y;
        temp[thread].z += temp[thread+32].z;
        if (thread < 16) {
            temp[thread].x += temp[thread+16].x;
            temp[thread].y += temp[thread+16].y;
            temp[thread].z += temp[thread+16].z;
        }
        if (thread < 8) {
            temp[thread].x += temp[thread+8].x;
            temp[thread].y += temp[thread+8].y;
            temp[thread].z += temp[thread+8].z;
        }
        if (thread < 4) {
            temp[thread].x += temp[thread+4].x;
            temp[thread].y += temp[thread+4].y;
            temp[thread].z += temp[thread+4].z;
        }
        if (thread < 2) {
            temp[thread].x += temp[thread+2].x;
            temp[thread].y += temp[thread+2].y;
            temp[thread].z += temp[thread+2].z;
        }
    }
    __syncthreads();
    cm = make_float3(INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z));

    // Now remove the center of mass velocity from each atom.

    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        real4 velocity = velm[index];
        velocity.x -= cm.x;
        velocity.y -= cm.y;
        velocity.z -= cm.z;
        velm[index] = velocity;
    }
}