(a) |
// Optimized Implementation for Zig-Zag Scanning |
|
/*
 * For Input
 *    1  2  3  4
 *    5  6  7  8
 *    9 10 11 12
 *   13 14 15 16
 * Output after Zig-Zag scanning is
 *   1 2 5 9 6 3 4 7 10 13 14 11 8 12 15 16
 */
/*
 * Zig-zag reordering of 16 quantized coefficients read from `dct` and
 * written to `level`, done entirely in MXU registers.
 *
 * Annotation convention used below: "xrN = H L" names the two coefficients
 * held in the upper and lower half of register xrN, identified by their
 * raster position (1..16) in the input.
 * NOTE(review): this presumes 16-bit coefficients on a little-endian target,
 * so that the lower half is the coefficient at the lower address -- confirm
 * against the declarations of `dct` and `level`.
 */

/* load quantized coefficients to MXU registers, two per register */
S32LDD(xr1, dct, 0x0);            // xr1 = 2 1
S32LDD(xr2, dct, 0x4);            // xr2 = 4 3
S32LDD(xr3, dct, 0x8);            // xr3 = 6 5
S32LDD(xr4, dct, 0xc);            // xr4 = 8 7
S32LDD(xr5, dct, 0x10);           // xr5 = 10 9
S32LDD(xr6, dct, 0x14);           // xr6 = 12 11
S32LDD(xr7, dct, 0x18);           // xr7 = 14 13
S32LDD(xr8, dct, 0x1c);           // xr8 = 16 15

/*
 * adjust positions of coefficients according to Zig-Zag order
 *
 * As the annotations show, S32SFL(a, b, c, d, 3) produces
 *   a = { upper(b), upper(c) }   and   d = { lower(b), lower(c) }.
 * Each shuffle consumes the second output of the previous one, so the
 * order of these five instructions must not be changed.
 * xr1 and xr8 already hold their final pairs and are not shuffled.
 */
S32SFL(xr9, xr6, xr4, xr10, 3);   // xr6 = 12 11  xr4 = 8 7    ->  xr9 = 12 8   xr10 = 11 7
S32SFL(xr4, xr10, xr7, xr6, 3);   // xr10 = 11 7  xr7 = 14 13  ->  xr4 = 11 14  xr6 = 7 13
S32SFL(xr7, xr6, xr2, xr10, 3);   // xr6 = 7 13   xr2 = 4 3    ->  xr7 = 7 4    xr10 = 13 3
S32SFL(xr6, xr10, xr5, xr2, 3);   // xr10 = 13 3  xr5 = 10 9   ->  xr6 = 13 10  xr2 = 3 9
S32SFL(xr5, xr2, xr3, xr10, 3);   // xr2 = 3 9    xr3 = 6 5    ->  xr5 = 3 6    xr10 = 9 5

/* reordered coefficients are stored to main memory from MXU registers */
S32STD(xr1, level, 0x0);          // xr1 = 2 1
S32STD(xr10, level, 0x4);         // xr10 = 9 5
S32STD(xr5, level, 0x8);          // xr5 = 3 6
S32STD(xr7, level, 0xc);          // xr7 = 7 4
S32STD(xr6, level, 0x10);         // xr6 = 13 10
S32STD(xr4, level, 0x14);         // xr4 = 11 14
S32STD(xr9, level, 0x18);         // xr9 = 12 8
S32STD(xr8, level, 0x1c);         // xr8 = 16 15
(b) |
// Optimized Implementation for top edge extension of luma component
/*
 * For each of the Lwidth/16 column groups: load four consecutive 32-bit
 * words from ExtndDataUP_Inp into xr1..xr4 once, then store that same
 * group 16 times through ExtndDataUP_Out, advancing the output pointer
 * after every copy.
 *
 * NOTE(review): the increments 4 and 64 are scaled by the element types of
 * ExtndDataUP_Out and ExtndDataUP_Inp, which are declared outside this
 * fragment -- confirm they match the intended output and input strides.
 * ExtndDataUP_Out is only ever advanced here; it is not rewound between
 * column groups.
 */
for (j = 0; j < (Lwidth/16); j++)
{
    /* Fetch the source group once; it is reused for every copy below. */
    S32LDD(xr1, ExtndDataUP_Inp, 0x0);      // loading data
    S32LDD(xr2, ExtndDataUP_Inp, 0x4);
    S32LDD(xr3, ExtndDataUP_Inp, 0x8);
    S32LDD(xr4, ExtndDataUP_Inp, 0xc);

    for (i = 0; i < 16; i++)
    {
        S32STD(xr1, ExtndDataUP_Out, 0x0);  // edge extension
        S32STD(xr2, ExtndDataUP_Out, 0x4);
        S32STD(xr3, ExtndDataUP_Out, 0x8);
        S32STD(xr4, ExtndDataUP_Out, 0xc);
        ExtndDataUP_Out += 4;               // move to the next copy's destination
    }

    ExtndDataUP_Inp += 64;                  // move to the next source group
}