Require: Matrix S (N × L matrix, where N = LWE_N and L = LWE_L) |
Ensure: Matrix S’ (L × N matrix, the transpose of S) |
1: for i from 0 to N, i+= BLOCK_TRANSPOSE do |
2: if i+BLOCK_TRANSPOSE > N |
3: let i -= BLOCK_TRANSPOSE - (N % BLOCK_TRANSPOSE); |
4: for j from 0 to L, j+=BLOCK_TRANSPOSE do |
5: if j+BLOCK_TRANSPOSE > L |
6: let j -= BLOCK_TRANSPOSE - (L % BLOCK_TRANSPOSE); |
7: vec1_l = NEON_Vector_Load(S + i*L + j); |
8: vec1_h = NEON_Vector_Load(S + i*L + j + 8); |
9: vec2_l = NEON_Vector_Load(S + (i + 8)*L + j); |
10: vec2_h = NEON_Vector_Load(S + (i + 8)*L + j + 8); |
11: t2 = NEON_Vector_Interleave(vec1_l, vec2_l); |
12: t3 = NEON_Vector_Interleave(vec1_h, vec2_h); |
13: vec1_l = NEON_Vector_Load(S + (i + 2)*LWE_L + j); |
14: vec1_h = NEON_Vector_Load(S + (i + 2)*LWE_L + j + 8); |
15: vec2_l = NEON_Vector_Load(S + (i + 10)*LWE_L + j); |
16: vec2_h = NEON_Vector_Load(S + (i + 10)*LWE_L + j + 8); |
17: t4 = NEON_Vector_Interleave(vec1_l, vec2_l); |
18: t5 = NEON_Vector_Interleave(vec1_h, vec2_h); |
19: vec1_l = NEON_Vector_Load(S + (i + 3)*LWE_L + j); |
20: vec1_h = NEON_Vector_Load(S + (i + 3)*LWE_L + j + 8); |
21: vec2_l = NEON_Vector_Load(S + (i + 11)*LWE_L + j); |
22: vec2_h = NEON_Vector_Load(S + (i + 11)*LWE_L + j + 8); |
23: t6 = NEON_Vector_Interleave(vec1_l, vec2_l); |
24: t7 = NEON_Vector_Interleave(vec1_h, vec2_h); |
25: m0 = NEON_Vector_Interleave(t0.val[0], t4.val[0]); |
26: m1 = NEON_Vector_Interleave(t0.val[1], t4.val[1]); |
27: m2 = NEON_Vector_Interleave(t1.val[0], t5.val[0]); |
28: m3 = NEON_Vector_Interleave(t1.val[1], t5.val[1]); |
29: m4 = NEON_Vector_Interleave(t2.val[0], t6.val[0]); |
30: m5 = NEON_Vector_Interleave(t2.val[1], t6.val[1]); |
31: m6 = NEON_Vector_Interleave(t3.val[0], t7.val[0]); |
32: m7 = NEON_Vector_Interleave(t3.val[1], t7.val[1]); |
33: t0 = NEON_Vector_Interleave(m0.val[0], m4.val[0]); |
34: t1 = NEON_Vector_Interleave(m0.val[1], m4.val[1]); |
35: t2 = NEON_Vector_Interleave(m1.val[0], m5.val[0]); |
36: t3 = NEON_Vector_Interleave(m1.val[1], m5.val[1]); |
37: t4 = NEON_Vector_Interleave(m2.val[0], m6.val[0]); |
38: t5 = NEON_Vector_Interleave(m2.val[1], m6.val[1]); |
39: t6 = NEON_Vector_Interleave(m3.val[0], m7.val[0]); |
40: t7 = NEON_Vector_Interleave(m3.val[1], m7.val[1]); |
41: NEON_Vector_Store(S’ + j*LWE_N + i, t0.val[0]); |
42: NEON_Vector_Store(S’ + j*LWE_N + i + 8, t0.val[1]); |
43: NEON_Vector_Store(S’ + (j + 1)*LWE_N + i, t1.val[0]); |
44: NEON_Vector_Store(S’ + (j + 1)*LWE_N + i + 8, t1.val[1]); |
45: NEON_Vector_Store(S’ + (j + 2)*LWE_N + i, t2.val[0]); |
46: NEON_Vector_Store(S’ + (j + 2)*LWE_N + i + 8, t2.val[1]); |
47: NEON_Vector_Store(S’ + (j + 3)*LWE_N + i, t3.val[0]); |
48: NEON_Vector_Store(S’ + (j + 3)*LWE_N + i + 8, t3.val[1]); |
49: NEON_Vector_Store(S’ + (j + 4)*LWE_N + i, t4.val[0]); |
50: NEON_Vector_Store(S’ + (j + 4)*LWE_N + i + 8, t4.val[1]); |
51: NEON_Vector_Store(S’ + (j + 5)*LWE_N + i, t5.val[0]); |
52: NEON_Vector_Store(S’ + (j + 5)*LWE_N + i + 8, t5.val[1]); |
53: NEON_Vector_Store(S’ + (j + 6)*LWE_N + i, t6.val[0]); |
54: NEON_Vector_Store(S’ + (j + 6)*LWE_N + i + 8, t6.val[1]); |
55: NEON_Vector_Store(S’ + (j + 7)*LWE_N + i, t7.val[0]); |
56: NEON_Vector_Store(S’ + (j + 7)*LWE_N + i + 8, t7.val[1]); |
57: Return S’ |