Application Article
A High-Performance Parallel FDTD Method Enhanced by Using SSE Instruction Set
for(i = 0; i <= nx; i ++)
{
    vHi_Coeff = _mm_load1_ps(&Hi_Coeff); // load single float value to vector
    for(j = 0; j <= ny; j++)
{
        vHj_Coeff = _mm_load1_ps(&Hj_Coeff); // load single float value to vector
        vEk = (__m128 *) Ek[i][j];
        vHx = (__m128 *) Hx[i][j];
        vHy = (__m128 *) Hy[i][j];
        vHy_minus = (__m128 *) Hy[i-1][j];
        vHx_minus = (__m128 *) Hx[i][j-1];
        for(k = 0, vk = 0; k < nz; k += 4, vk ++)
{
            vEk_Coeff = _mm_load1_ps(&Ek_Coeff);
            xmm0 = _mm_sub_ps(vHx[vk], vHx_minus[vk]);
            xmm0 = _mm_mul_ps(vHj_Coeff, xmm0);
            xmm1 = _mm_sub_ps(vHy[vk], vHy_minus[vk]);
            xmm1 = _mm_mul_ps(vHi_Coeff, xmm1);
            xmm0 = _mm_sub_ps(xmm1, xmm0);
            xmm1 = _mm_mul_ps(vEk[vk], vEk_Coeff);
            vEk[vk] = _mm_add_ps(xmm1, xmm0);
|