Application Article

A High-Performance Parallel FDTD Method Enhanced by Using SSE Instruction Set

Algorithm 1

for(i = 0; i <= nx; i ++) {
 vHi_Coeff = _mm_load1_ps(&Hi_Coeff);
 //load single float value to vector
 for(j = 0; j <= ny; j++) {
  vHj_Coeff = _mm_load1_ps(&Hj_Coeff);
  //load single float value to vector
  vEk = (__m128 *) Ek[i][j];
  vHx = (__m128 *) Hx[i][j];
  vHy = (__m128 *) Hy[i][j];
  vHy_minus = (__m128 *) Hy[i-1][j];
  vHx_minus = (__m128 *) Hx[i][j-1];
 for(k = 0, vk = 0; k < nz; k += 4, vk ++) {
  vEk_Coeff = _mm_load1_ps(&Ek_Coeff);
  xmm0 = _mm_sub_ps(vHx[vk], vHx_minus[vk]);
  xmm0 = _mm_mul_ps(vHj_Coeff, xmm0);
  xmm1 = _mm_sub_ps(vHy[vk], vHy_minus[vk]);
  xmm1 = _mm_mul_ps(vHi_Coeff, xmm1);
  xmm0 = _mm_sub_ps(xmm1, xmm0 );
  xmm1 = _mm_mul_ps(vEk [vk], vEk_Coeff);
  vEk [vk] = _mm_add_ps(xmm1, xmm0);