// Vectorized sum reduction: x = 10.0f + sum of sqrt(y[k]) for k in [0, n). |
// Three phases: a masked peeling loop, a main loop unrolled over two |
// 16-element vectors, and a masked remainder loop. The two partial-sum |
// vectors are combined and reduced horizontally at the end. |
// Notation: y[k:16] denotes the 16 consecutive elements starting at y[k]. |
// byte offset of y[0] within a 64-byte block |
misalign = &y[0] & 63 |
// number of leading elements handled by the peeling loop |
// NOTE(review): this evaluates to 15 when misalign == 0 (y already aligned); |
// confirm the intended formula against the original listing. |
peeledTripCount = (63 - misalign) / sizeof(float) |
x = 10.0f; |
// create a vector: <0,1,2,…15> |
k0_v512 = _mm512_series_pi(0, 1, 16) |
// create vector: all 16 elements are peeledTripCount |
peeledTripCount_v512 = _mm512_broadcast_pi32(peeledTripCount) |
// two independent accumulators for partial sums, both starting at zero |
x1_v512 = (m512)0 |
x2_v512 = (m512)0 |
do k0 = 0, peeledTripCount-1, 16 |
  // generate mask for vectorizing peeling loop: lanes whose index is below peeledTripCount |
  mask = _mm512_compare_pi32_mask_lt(k0_v512, peeledTripCount_v512) |
  // presumably accumulates sqrt(y[k0:16]) into x1_v512 only in the lanes selected by mask |
  x1_v512 = _mm512_add_ps_mask(_mm512_fsqrt(y[k0:16]), x1_v512, mask) |
enddo |
// end of the main loop: drop the leftover (n - peeledTripCount) mod 32 elements, |
// so the main loop covers a whole number of 32-element steps |
mainTripCount = n - ((n - peeledTripCount) & 31) |
do k1 = peeledTripCount, mainTripCount-1, 32 |
  // unrolled by two: each iteration consumes 32 elements, 16 per accumulator |
  x1_v512 = _mm512_add_ps(_mm512_fsqrt(y[k1:16]), x1_v512) |
  x2_v512 = _mm512_add_ps(_mm512_fsqrt(y[k1+16:16]), x2_v512) |
enddo |
// create a vector: <mainTripCount,mainTripCount+1 … mainTripCount+15> |
k2_v512 = _mm512_series_pi(mainTripCount, 1, 16) |
// create a vector: all 16 elements have the same value n |
n_v512 = _mm512_broadcast_pi32(n) |
// create a vector: all 16 elements are the step 16 |
step_v512 = _mm512_broadcast_pi32(16) |
do k2 = mainTripCount, n, 16 // vectorized remainder loop |
  // generate mask for the remainder loop: lanes whose index is below n |
  mask = _mm512_compare_pi32_mask_lt(k2_v512, n_v512) |
  x1_v512 = _mm512_add_ps_mask(_mm512_fsqrt(y[k2:16]), x1_v512, mask) |
  // advance the index vector by 16 for the next iteration |
  // NOTE(review): k2_v512 and step_v512 are built with _pi (integer) intrinsics but |
  // added with _mm512_add_ps; confirm whether an integer add was intended. |
  k2_v512 = _mm512_add_ps(k2_v512, step_v512) |
enddo |
// combine the two partial-sum vectors |
x1_v512 = _mm512_add_ps(x1_v512, x2_v512); |
// perform horizontal add over the elements of the vector and final |
// reduction sum to write the result back to x. |
x = x + _mm512_hadd_ps(x1_v512) |