bgq.h (forked from etmc/tmLQCD)

#ifndef _BGQ_H
#define _BGQ_H
#include "bgq2.h"
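
/* Macros wrapping the IBM QPX vector intrinsics of Blue Gene/Q
 * (vec_ld, vec_st, vec_perm, vec_xxnpmadd, ...) for loading, storing
 * and combining spinor data.  The register arguments (r0..r5, rs0..,
 * s0..) are assumed to be of type vector4double; phi is assumed to be
 * a structure with contiguous complex members c0, c1, c2, as in
 * tmLQCD's su3_vector/spinor types. */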
#define _vec_load_spinor(r0, r1, r2, r3, r4, r5, phi) \
  r0 = vec_ld(0L, (double*) &(phi).c0); \
  r1 = vec_ld(32L, (double*) &(phi).c0); \
  r2 = vec_ld(64L, (double*) &(phi).c0); \
  r3 = vec_ld(96L, (double*) &(phi).c0); \
  r4 = vec_ld(128L, (double*) &(phi).c0); \
  r5 = vec_ld(160L, (double*) &(phi).c0);
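
/* _vec_load_spinor reads a full double precision spinor (12 complex
 * doubles, 192 bytes) starting at &(phi).c0 into six vector4double
 * registers, 32 bytes per vec_ld.  Usage sketch (assuming a tmLQCD-like
 * spinor *s whose first colour vector is s->s0):
 *
 *   vector4double r0, r1, r2, r3, r4, r5;
 *   _vec_load_spinor(r0, r1, r2, r3, r4, r5, s->s0);
 */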

#define _vec_load_halfspinor(r0, r1, r2, phi) \
  r0 = vec_ld(0L, (double*) &(phi).c0); \
  r1 = vec_ld(32L, (double*) &(phi).c0); \
  r2 = vec_ld(64L, (double*) &(phi).c0);

#define _vec_load_halfspinor_32(r0, r1, r2, phi) \
  r0 = vec_ld(0L, (float*) &(phi).c0); \
  r1 = vec_ld(16L, (float*) &(phi).c0); \
  r2 = vec_ld(32L, (float*) &(phi).c0);
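
/* The halfspinor variants read two colour vectors (6 complex numbers):
 * 96 bytes in three 32-byte loads in double precision, 48 bytes in
 * three 16-byte loads in single precision. */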

#define _vec_store_spinor(phi, r0, r1, r2, r3, r4, r5) \
  vec_st(r0, 0L, (double*) &(phi).c0); \
  vec_st(r1, 32L, (double*) &(phi).c0); \
  vec_st(r2, 64L, (double*) &(phi).c0); \
  vec_st(r3, 96L, (double*) &(phi).c0); \
  vec_st(r4, 128L, (double*) &(phi).c0); \
  vec_st(r5, 160L, (double*) &(phi).c0);
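
/* _vec_store_spinor is the inverse of _vec_load_spinor: it writes the
 * six registers back as one contiguous 192-byte spinor. */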

#define _vec_add_ul_spinor(rs0, rs1, rs2, r0, r1, r2, r3, r4, r5) \
  rs0 = vec_add(r0, r3); \
  rs1 = vec_add(r1, r4); \
  rs2 = vec_add(r2, r5);

#define _vec_sub_ul_spinor(rs0, rs1, rs2, r0, r1, r2, r3, r4, r5) \
  rs0 = vec_sub(r0, r3); \
  rs1 = vec_sub(r1, r4); \
  rs2 = vec_sub(r2, r5);
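
/* _vec_add_ul_spinor / _vec_sub_ul_spinor combine the upper half
 * (r0-r2) and the lower half (r3-r5) of a loaded spinor component-wise
 * (sum or difference), e.g. when forming spin projections of a spinor
 * loaded with _vec_load_spinor. */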

// requires 32 byte alignment of phi
#define _vec_load(r0, r1, phi) \
  r0 = vec_ld(0L, (double*) &(phi).c0); \
  r1 = vec_ld2(0L, (double*) &(phi).c2);

#define _vec_load_32(r0, r1, phi) \
  r0 = vec_ld(0L, (float*) &(phi).c0); \
  r1 = vec_ld2(0L, (float*) &(phi).c2);
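
/* _vec_load / _vec_load_32 read a single colour vector (three complex
 * numbers): c0 and c1 with a full vec_ld, c2 with a two-element
 * vec_ld2.  The result occupies the register pair (r0, r1), with only
 * the lower half of r1 in use. */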

// also works with 16 byte alignment of phi
#define _vec_load16(r0, r1, phi, tmp) \
  r0 = vec_ld2(0L, (double*) &(phi).c0); \
  r1 = vec_ld(0L, (double*) &(phi).c1); \
  tmp = vec_gpci(00145); \
  r0 = vec_perm(r0, r1, tmp); \
  tmp = vec_gpci(02301); \
  r1 = vec_perm(r1, r0, tmp);

#define _vec_load16_32(r0, r1, phi, tmp) \
  r0 = vec_ld2(0L, (float*) &(phi).c0); \
  r1 = vec_ld(0L, (float*) &(phi).c1); \
  tmp = vec_gpci(00145); \
  r0 = vec_perm(r0, r1, tmp); \
  tmp = vec_gpci(02301); \
  r1 = vec_perm(r1, r0, tmp);

// alternative
#define _vec_load16c(r0, r1, phi, tmp) \
  r0 = vec_ld2(0L, (double*) &(phi).c0); \
  r1 = vec_ld(0L, (double*) &(phi).c1); \
  tmp = vec_gpci(00145); \
  r0 = vec_perm(r0, r1, tmp); \
  r1 = vec_ld2(0L, (double*) &(phi).c2);
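
/* The load16 variants handle a phi that is only 16 byte aligned: the
 * components are fetched with vec_ld2/vec_ld at 16 byte granularity
 * and merged with vec_perm, using permutation patterns generated by
 * vec_gpci, so that (r0, r1) ends up holding the colour vector in the
 * same layout as after _vec_load. */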

// requires 32 byte alignment of phi
#define _vec_store(phi, r0, r1) \
  vec_st((r0), 0L, (double*) &(phi).c0); \
  vec_st2((r1), 0L, (double*) &(phi).c2);

// requires 16 byte alignment of phi
#define _vec_store_32(phi, r0, r1) \
  vec_st((r0), 0L, (float*) &(phi).c0); \
  vec_st2((r1), 0L, (float*) &(phi).c2);

// requires 16 byte (and must not be 32 byte) alignment of phi
#define _vec_store16(phi, r0, r1, tmp) \
  vec_st2((r0), 0L, (double*) &(phi).c0); \
  tmp = vec_gpci(02345); \
  r0 = vec_perm(r0, r1, tmp); \
  vec_st((r0), 0L, (double *) &(phi).c1);

// requires 8 byte (and must not be 16 byte) alignment of phi
#define _vec_store16_32(phi, r0, r1, tmp) \
  vec_st2((r0), 0L, (float*) &(phi).c0); \
  tmp = vec_gpci(02345); \
  r0 = vec_perm(r0, r1, tmp); \
  vec_st((r0), 0L, (float *) &(phi).c1);
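
/* The store16 variants mirror the load16 case: vec_st2 writes c0, then
 * vec_perm packs c1 and c2 into one register which is written with a
 * full vec_st starting at &(phi).c1. */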

// requires 32 byte alignment of phi
#define _vec_store_halfspinor(phi, r0, r1, r2) \
  vec_st((r0), 0L, (double*) &(phi).c0); \
  vec_st((r1), 32L, (double*) &(phi).c0); \
  vec_st((r2), 64L, (double*) &(phi).c0);

// requires 16 byte alignment of phi
#define _vec_store_halfspinor_32(phi, r0, r1, r2) \
  vec_st((r0), 0L, (float*) &(phi).c0); \
  vec_st((r1), 16L, (float*) &(phi).c0); \
  vec_st((r2), 32L, (float*) &(phi).c0);
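
/* _vec_store_halfspinor writes two colour vectors (96 bytes in double,
 * 48 bytes in single precision) back in three consecutive stores,
 * mirroring _vec_load_halfspinor. */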

#define _vec_add(rs0, rs1, r0, r1, s0, s1) \
  rs0 = vec_add(r0, s0); \
  rs1 = vec_add(r1, s1);

#define _vec_sub(rs0, rs1, r0, r1, s0, s1) \
  rs0 = vec_sub(r0, s0); \
  rs1 = vec_sub(r1, s1);
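
/* _vec_add / _vec_sub add or subtract two colour vectors, each held in
 * a register pair as produced by _vec_load and friends. */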

#define _vec_i_mul_add(rs0, rs1, r0, r1, s0, s1, tmp) \
  tmp = vec_splats(1.); \
  rs0 = vec_xxnpmadd(s0, tmp, r0); \
  rs1 = vec_xxnpmadd(s1, tmp, r1);

#define _vec_i_mul_sub(rs0, rs1, r0, r1, s0, s1, tmp) \
  tmp = vec_splats(-1.); \
  rs0 = vec_xxnpmadd(s0, tmp, r0); \
  rs1 = vec_xxnpmadd(s1, tmp, r1);
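
/* _vec_i_mul_add / _vec_i_mul_sub compute rs = r + i*s and rs = r - i*s
 * on colour vectors held in register pairs; the splatted +/-1 in tmp
 * drives the cross multiply-add vec_xxnpmadd, and tmp is clobbered.
 *
 * Usage sketch (hypothetical names; p and q are tmLQCD-like spinor
 * pointers with su3_vector members s0..s3):
 *
 *   vector4double u0, u1, v0, v1, w0, w1, tmp;
 *   _vec_load(u0, u1, p->s0);
 *   _vec_load(v0, v1, p->s2);
 *   _vec_i_mul_add(w0, w1, u0, u1, v0, v1, tmp);   // w = u + i*v
 *   _vec_store(q->s0, w0, w1);
 */
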
#endif