Research Article

Efficient Parallel Implementation of Matrix Multiplication for Lattice-Based Cryptography on Modern ARM Processor

Algorithm 1

Efficient matrix transpose.
Require: Matrix S ($N \times L$ matrix)
Ensure: Matrix S’ ($L \times N$ matrix)
1:  for i from 0 to N, i+= BLOCK_TRANSPOSE do
2:      if i+BLOCK_TRANSPOSE > N
3:        let i -= BLOCK_TRANSPOSE - (N mod BLOCK_TRANSPOSE);
4:      for j from 0 to L, j+=BLOCK_TRANSPOSE do
5:       if j+BLOCK_TRANSPOSE > L
6:         let j -= BLOCK_TRANSPOSE - (L mod BLOCK_TRANSPOSE);
7:       vec1_l=NEON_Vector_Load(S+i*L+j);
8:       vec1_h=NEON_Vector_Load(S+i*L+j+8);
9:       vec2_l=NEON_Vector_Load(S+(i+8)*L+j);
10:       vec2_h=NEON_Vector_Load(S+(i+8)*L+j+8);
11:       t2 = NEON_Vector_Interleave(vec1_l, vec2_l);
12:       t3 = NEON_Vector_Interleave(vec1_h, vec2_h);
13:       vec1_l = NEON_Vector_Load (S + (i + 2) * LWE_L + j);
14:       vec1_h = NEON_Vector_Load (S + (i + 2) * LWE_L + j + 8);
15:       vec2_l = NEON_Vector_Load (S + (i + 10) * LWE_L + j);
16:       vec2_h = NEON_Vector_Load (S + (i + 10) * LWE_L + j + 8);
17:       t4 = NEON_Vector_Interleave(vec1_l, vec2_l);
18:       t5 = NEON_Vector_Interleave(vec1_h, vec2_h);
19:       vec1_l = NEON_Vector_Load (S + (i + 3) * LWE_L + j);
20:       vec1_h = NEON_Vector_Load (S + (i + 3) * LWE_L + j + 8);
21:       vec2_l = NEON_Vector_Load (S + (i + 11) * LWE_L + j);
22:       vec2_h = NEON_Vector_Load (S + (i + 11) * LWE_L + j + 8);
23:       t6 = NEON_Vector_Interleave(vec1_l, vec2_l);
24:       t7 = NEON_Vector_Interleave(vec1_h, vec2_h);
25:       m0 = NEON_Vector_Interleave(t0.val[0], t4.val[0]);
26:       m1 = NEON_Vector_Interleave(t0.val[1], t4.val[1]);
27:       m2 = NEON_Vector_Interleave(t1.val[0], t5.val[0]);
28:       m3 = NEON_Vector_Interleave(t1.val[1], t5.val[1]);
29:       m4 = NEON_Vector_Interleave(t2.val[0], t6.val[0]);
30:       m5 = NEON_Vector_Interleave(t2.val[1], t6.val[1]);
31:       m6 = NEON_Vector_Interleave(t3.val[0], t7.val[0]);
32:       m7 = NEON_Vector_Interleave(t3.val[1], t7.val[1]);
33:       t0 = NEON_Vector_Interleave(m0.val[0], m4.val[0]);
34:       t1 = NEON_Vector_Interleave(m0.val[1], m4.val[1]);
35:       t2 = NEON_Vector_Interleave(m1.val[0], m5.val[0]);
36:       t3 = NEON_Vector_Interleave(m1.val[1], m5.val[1]);
37:       t4 = NEON_Vector_Interleave(m2.val[0], m6.val[0]);
38:       t5 = NEON_Vector_Interleave(m2.val[1], m6.val[1]);
39:       t6 = NEON_Vector_Interleave(m3.val[0], m7.val[0]);
40:       t7 = NEON_Vector_Interleave(m3.val[1], m7.val[1]);
41:       NEON_Vector_Store (S’ + j * LWE_N + i, t0.val[0]);
42:       NEON_Vector_Store (S’ + j * LWE_N + i + 8, t0.val[1]);
43:       NEON_Vector_Store (S’ + (j + 1) * LWE_N + i, t1.val[0]);
44:       NEON_Vector_Store (S’ + (j + 1) * LWE_N + i + 8, t1.val[1]);
45:       NEON_Vector_Store (S’ + (j + 2) * LWE_N + i, t2.val[0]);
46:       NEON_Vector_Store (S’ + (j + 2) * LWE_N + i + 8, t2.val[1]);
47:       NEON_Vector_Store (S’ + (j + 3) * LWE_N + i, t3.val[0]);
48:       NEON_Vector_Store (S’ + (j + 3) * LWE_N + i + 8, t3.val[1]);
49:       NEON_Vector_Store (S’ + (j + 4) * LWE_N + i, t4.val[0]);
50:       NEON_Vector_Store (S’ + (j + 4) * LWE_N + i + 8, t4.val[1]);
51:       NEON_Vector_Store (S’ + (j + 5) * LWE_N + i, t5.val[0]);
52:       NEON_Vector_Store (S’ + (j + 5) * LWE_N + i + 8, t5.val[1]);
53:       NEON_Vector_Store (S’ + (j + 6) * LWE_N + i, t6.val[0]);
54:       NEON_Vector_Store (S’ + (j + 6) * LWE_N + i + 8, t6.val[1]);
55:       NEON_Vector_Store (S’ + (j + 7) * LWE_N + i, t7.val[0]);
56:       NEON_Vector_Store (S’ + (j + 7) * LWE_N + i + 8, t7.val[1]);
57:       Return S’