-
Notifications
You must be signed in to change notification settings - Fork 0
/
Jun11-1.c
131 lines (83 loc) · 2.78 KB
/
Jun11-1.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#include <stdio.h>
#include <arm_neon.h>
// First 16-coefficient operand handed to Toep_2x8 by main.
uint16_t a[16] = {
    1,  2,  3,  4,  5,  6,  7,  8,
    9, 10, 11, 12, 13, 14, 15, 16
};
// Second 16-coefficient operand handed to Toep_2x8 by main.
uint16_t b[16] = {
    1,  2,  3,  4,  5,  6,  7,  8,
    9, 10, 11, 12, 13, 14, 15, 16
};
// 16-coefficient output buffer; zero-initialized because it has static storage.
uint16_t result[16];
// Multiplies the 16-coefficient inputs a and b by splitting the problem into
// three 8x8 Toeplitz matrix-vector products (computed with naiveTMVP) and
// prints the 16 resulting coefficients. r is the intended destination for the
// 16 result coefficients (main passes the global `result`).
void Toep_2x8(uint16_t *a, uint16_t *b, uint16_t *r);
// 8x8 Toeplitz matrix-vector product using NEON intrinsics: a points to the
// 15 elements that define the matrix, b to the 8-element vector, and r
// receives the 8 results.
void naiveTMVP(uint16_t *a, uint16_t *b, uint16_t *r);
// Entry point: runs the 16-coefficient Toeplitz multiplication once on the
// global operands a and b, framed by a greeting and a farewell line.
int main() {
    fputs("Hello\n", stdout);

    // Toep_2x8 prints the 16 result coefficients itself.
    Toep_2x8(a, b, result);

    fputs("byebye\n", stdout);
    return 0;
}
// Toep_2x8: multiplies two 16-coefficient vectors with a two-way split into
// 8x8 Toeplitz matrix-vector products (TMVP).
//
//   a: 16 input coefficients from which the Toeplitz blocks are built.
//   b: 16 input coefficients, used as halves B_0 = b[0..7] and B_1 = b[8..15].
//   r: receives the 16 output coefficients.
//
// Two 15-element diagonal arrays are derived from a:
//   A_0   : a[7], a[6], ..., a[0], a[15], ..., a[9]   (index (23 - i) % 16)
//   A_pm1 : a[15], a[14], ..., a[1]                   (index 15 - i)
// A_pm1 is used for both off-diagonal blocks (A_1 and A_-1).
//
// Three half-size products are evaluated with naiveTMVP():
//   P1 = (A_pm1 - A_0) * B_1
//   P2 =  A_0          * (B_0 + B_1)
//   P3 = (A_pm1 - A_0) * B_0
// and recombined as r[0..7] = P1 + P2 and r[8..15] = P2 + P3, which works out
// to r[k] = sum_j a[(k - j) mod 16] * b[j]. All arithmetic is done on
// uint16_t and therefore wraps modulo 2^16.
//
// The 16 results are also printed to stdout, separated by spaces.
void Toep_2x8(uint16_t *a, uint16_t *b, uint16_t *r){
    enum { HALF = 8, FULL = 16, DIAGS = 15 };

    // Build the diagonals of both blocks and their difference in one pass.
    // The same difference serves P1 and P3 because A_1 and A_-1 share A_pm1.
    uint16_t toep_a_0[DIAGS];
    uint16_t toep_a_pm1[DIAGS];
    uint16_t toep_diff[DIAGS];
    for (int i = 0; i < DIAGS; i++) {
        toep_a_0[i] = a[(23 - i) % FULL];
        toep_a_pm1[i] = a[15 - i];
        toep_diff[i] = (uint16_t)(toep_a_pm1[i] - toep_a_0[i]);
    }

    // B_0 + B_1, element-wise.
    uint16_t b_sum[HALF];
    for (int i = 0; i < HALF; i++) {
        b_sum[i] = (uint16_t)(b[i] + b[HALF + i]);
    }

    // (A_pm1 - A_0) B_1
    uint16_t p1[HALF] = {0};
    naiveTMVP(toep_diff, &b[HALF], p1);

    // A_0 (B_0 + B_1)
    uint16_t p2[HALF] = {0};
    naiveTMVP(toep_a_0, b_sum, p2);

    // (A_pm1 - A_0) B_0
    uint16_t p3[HALF] = {0};
    naiveTMVP(toep_diff, b, p3);

    // Recombination. Write through the caller-supplied pointer r rather than
    // a global, so the function works for any destination buffer. Both a and
    // b have been fully consumed by this point.
    for (int i = 0; i < HALF; i++) {
        r[i] = (uint16_t)(p1[i] + p2[i]);
        r[HALF + i] = (uint16_t)(p2[i] + p3[i]);
    }

    for (int i = 0; i < FULL; i++) {
        printf("%d ", r[i]);
    }
}
// naiveTMVP: 8x8 Toeplitz matrix-vector product using NEON intrinsics.
//
//   a: 15 elements defining the Toeplitz matrix; the entry in row k,
//      column i is a[7 - k + i].
//   b: the 8-element vector to multiply with.
//   r: receives the 8 results, r[k] = sum_{i=0..7} a[7 - k + i] * b[i].
//
// All arithmetic is on 16-bit unsigned lanes and wraps modulo 2^16.
void naiveTMVP(uint16_t *a, uint16_t *b, uint16_t *r){
    uint16x8_t acc = vdupq_n_u16(0);

    for (int i = 0; i < 8; i++) {
        // Load the 8-element window a[i..i+7] (the last one, i = 7, reads
        // a[7..14], so exactly 15 elements of a are touched) and accumulate
        // window * b[i] into every lane.
        uint16x8_t window = vld1q_u16(&a[i]);
        acc = vmlaq_n_u16(acc, window, b[i]);
    }

    uint16_t lanes[8];
    vst1q_u16(lanes, acc);

    // Lane j now holds sum_i a[i + j] * b[i]. Row k of the matrix uses
    // a[7 - k + i], i.e. offset j = 7 - k, so the lanes come out in reverse
    // row order and must be flipped when copied to r.
    for (int k = 0; k < 8; k++) {
        r[k] = lanes[7 - k];
    }
}