float alpha1 = alphas1[Get_3D_Index(x,y,z,DATA_W,DATA_H)];
float alpha2 = alphas2[Get_3D_Index(x,y,z,DATA_W,DATA_H)];
float alpha3 = alphas3[Get_3D_Index(x,y,z,DATA_W,DATA_H)];
float alpha4 = alphas4[Get_3D_Index(x,y,z,DATA_W,DATA_H)];
s_Y[threadIdx.y][ 0 ][threadIdx.x] =
whitened_volumes[Get_4D_Index(x,y,z,c_Permutation_Vector[ 0 ],DATA_W,DATA_H,DATA_D)];
s_Y[threadIdx.y][ 1 ][threadIdx.x] = alpha1 * s_Y[threadIdx.y][ 0 ][threadIdx.x]
+  whitened_volumes[Get_4D_Index(x,y,z,c_Permutation_Vector[ 1 ],DATA_W,DATA_H,DATA_D)];
s_Y[threadIdx.y][ 2 ][threadIdx.x] = alpha1 * s_Y[threadIdx.y][ 1 ][threadIdx.x]
+ alpha2 * s_Y[threadIdx.y][ 0 ][threadIdx.x]
+  whitened_volumes[Get_4D_Index(x,y,z,c_Permutation_Vector[ 2 ],DATA_W,DATA_H,DATA_D)];
s_Y[threadIdx.y][ 3 ][threadIdx.x] = alpha1 * s_Y[threadIdx.y][ 2 ][threadIdx.x]
+ alpha2 * s_Y[threadIdx.y][ 1 ][threadIdx.x] + alpha3 * s_Y[threadIdx.y][ 0 ][threadIdx.x]
+ whitened_volumes[Get_4D_Index(x,y,z,c_Permutation_Vector[ 3 ],DATA_W,DATA_H,DATA_D)];
permuted_volumes[Get_4D_Index(x,y,z,0,DATA_W,DATA_H,DATA_D)] = s_Y[threadIdx.y][ 0 ][threadIdx.x];
permuted_volumes[Get_4D_Index(x,y,z,1,DATA_W,DATA_H,DATA_D)] = s_Y[threadIdx.y][ 1 ][threadIdx.x];
permuted_volumes[Get_4D_Index(x,y,z,2,DATA_W,DATA_H,DATA_D)] = s_Y[threadIdx.y][ 2 ][threadIdx.x];
permuted_volumes[Get_4D_Index(x,y,z,3,DATA_W,DATA_H,DATA_D)] = s_Y[threadIdx.y][ 3 ][threadIdx.x];
// Loop over time points
for (t = 4; t < DATA_T; t++)
{
 s_Y[threadIdx.y][ 4 ][threadIdx.x] =
             alpha1 * s_Y[threadIdx.y][ 3 ][threadIdx.x]
 + alpha2 * s_Y[threadIdx.y][ 2 ][threadIdx.x]
 + alpha3 * s_Y[threadIdx.y][ 1 ][threadIdx.x]
 + alpha4 * s_Y[threadIdx.y][ 0 ][threadIdx.x]
 + whitened_volumes[Get_4D_Index(x,y,z,c_Permutation_Vector[t],DATA_W,DATA_H,DATA_D)];
 permuted_volumes[Get_4D_Index(x,y,z,t,DATA_W,DATA_H,DATA_D)] = s_Y[threadIdx.y][ 4 ][threadIdx.x];
 // Save old values
 s_Y[threadIdx.y][ 0 ][threadIdx.x] = s_Y[threadIdx.y][ 1 ][threadIdx.x];
 s_Y[threadIdx.y][ 1 ][threadIdx.x] = s_Y[threadIdx.y][ 2 ][threadIdx.x];
 s_Y[threadIdx.y][ 2 ][threadIdx.x] = s_Y[threadIdx.y][ 3 ][threadIdx.x];
 s_Y[threadIdx.y][ 3 ][threadIdx.x] = s_Y[threadIdx.y][ 4 ][threadIdx.x];
}
Algorithm 1