10 #ifndef MULT_CLOVER_PARTS_QXS_H
11 #define MULT_CLOVER_PARTS_QXS_H
// Lookup table for the packed storage of a 6x6 block of complex clover
// coefficients.
//
// Layout: entry [2 * (6 * row + col) + ri] with row, col in 0..5 and
// ri = 0 (real part) / 1 (imaginary part); this is exactly the expression
// evaluated by index_clv_up() / index_clv_dn().
//
// Only the diagonal real parts (values 0..5) and the strict upper triangle
// (values 6..35, real/imaginary pairs) carry a storage slot; each value in
// 0..35 occurs exactly once. Every other position (imaginary part of the
// diagonal, everything below the diagonal) holds the sentinel -1.
//
// The position row 2 / col 1 / imaginary used to hold -5; it is an unused
// slot like its neighbours and now carries the same -1 sentinel.
constexpr int clv_idx[72] = {
   0, -1,  6,  7, 16, 17, 28, 29, 24, 25, 34, 35,
  -1, -1,  1, -1, 12, 13, 18, 19, 32, 33, 26, 27,
  -1, -1, -1, -1,  2, -1,  8,  9, 20, 21, 30, 31,
  -1, -1, -1, -1, -1, -1,  3, -1, 14, 15, 22, 23,
  -1, -1, -1, -1, -1, -1, -1, -1,  4, -1, 10, 11,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  5, -1,
};
25 inline constexpr
int index_clv_up(
int id1,
int ic1,
int id2,
int ic2,
int ri)
32 return clv_idx[2 * (6 * (ic1 + 3 * id1) + ic2 + 3 * id2) + ri];
36 inline constexpr
int index_clv_dn(
int id1,
int ic1,
int id2,
int ic2,
int ri)
43 return clv_idx[2 * (6 * (ic1 + 3 * id1) + ic2 + 3 * id2) + ri] + 36;
// mult_clover_csw_aypx: loop-based clover-term kernel.
//
// For every (jd, id, ic) it multiplies a 3-component complex coefficient
// row taken from `ctp` with two 3-component complex vectors taken from `v1`
// (at component `id` and at `id2 = (id + ND2) % ND`) and adds the two
// products to the work buffer `v2v`. A final loop over all NVCD entries
// combines `v1`, `a` and `v2v` through axpy_vec and stores the result to `v2`.
//
//   v2  - output, VLEN values per entry, written in the final loop only
//   a   - scalar handed to axpy_vec
//   v2v - work buffer of Vsimd_t, read and updated in the triple loop
//   ct  - coefficient data (not referenced directly in the visible lines)
//   v1  - input vector
//
// NOTE(review): `pg`, `ctp`, `ic2`, `wt1r/wt1i/wt2r/wt2i`, `xt1r/...`,
// `v1F` and `v2F` are used below but their declarations are not among the
// lines visible here - confirm against the full file.
48 template<
typename REALTYPE>
49 inline void mult_clover_csw_aypx(REALTYPE *v2, REALTYPE a,
Vsimd_t *v2v,
50 REALTYPE *ct, REALTYPE *v1)
// jd runs over ND2 values, id over ND values.
54 for (
int jd = 0; jd <
ND2; ++jd) {
55 for (
int id = 0;
id <
ND; ++id) {
// Partner component of id, ND2 further on (wrapping at ND).
56 int id2 = (
id +
ND2) %
ND;
61 for (
int ic = 0; ic <
NC; ++ic) {
// ut1*: coefficient row, vt1*: v1 at component id, vt2*: v1 at component id2.
// Even suffix = real part, odd suffix = imaginary part.
64 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
65 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
66 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Three complex coefficients, NVC reals apart, starting at offset ic2.
68 load_vec(pg, ut10, &ctp[
VLEN * (ic2)]);
69 load_vec(pg, ut11, &ctp[
VLEN * (ic2 + 1)]);
70 load_vec(pg, ut12, &ctp[
VLEN * (
NVC + ic2)]);
71 load_vec(pg, ut13, &ctp[
VLEN * (
NVC + ic2 + 1)]);
72 load_vec(pg, ut14, &ctp[
VLEN * (2 *
NVC + ic2)]);
73 load_vec(pg, ut15, &ctp[
VLEN * (2 *
NVC + ic2 + 1)]);
// v1 at component id; the three entries are 2 * ND reals apart.
75 load_vec(pg, vt10, &v1[
VLEN * (2 *
id)]);
76 load_vec(pg, vt11, &v1[
VLEN * (2 *
id + 1)]);
77 load_vec(pg, vt12, &v1[
VLEN * (2 *
ND + 2 *
id)]);
78 load_vec(pg, vt13, &v1[
VLEN * (2 *
ND + 2 *
id + 1)]);
79 load_vec(pg, vt14, &v1[
VLEN * (4 *
ND + 2 *
id)]);
80 load_vec(pg, vt15, &v1[
VLEN * (4 *
ND + 2 *
id + 1)]);
// Same three entries at the partner component id2.
82 load_vec(pg, vt20, &v1[
VLEN * (2 * id2)]);
83 load_vec(pg, vt21, &v1[
VLEN * (2 * id2 + 1)]);
84 load_vec(pg, vt22, &v1[
VLEN * (2 *
ND + 2 * id2)]);
85 load_vec(pg, vt23, &v1[
VLEN * (2 *
ND + 2 * id2 + 1)]);
86 load_vec(pg, vt24, &v1[
VLEN * (4 *
ND + 2 * id2)]);
87 load_vec(pg, vt25, &v1[
VLEN * (4 *
ND + 2 * id2 + 1)]);
// The same coefficient row is applied to both input vectors.
91 mult_uv(pg, wt1r, wt1i,
92 ut10, ut11, ut12, ut13, ut14, ut15,
93 vt10, vt11, vt12, vt13, vt14, vt15);
95 mult_uv(pg, wt2r, wt2i,
96 ut10, ut11, ut12, ut13, ut14, ut15,
97 vt20, vt21, vt22, vt23, vt24, vt25);
// Destination slots in v2v: component jd and component jd + ND2.
99 int icd1 = 2 * (jd +
ND * ic);
100 int icd2 = 2 * (jd +
ND2 +
ND * ic);
// v2v[icd1] (+1 for the imaginary part) is read, incremented by wt1, stored.
103 load_vec(pg, xt1r, &v2v[icd1].v[0]);
104 load_vec(pg, xt1i, &v2v[icd1 + 1].v[0]);
105 add_vec(pg, xt1r, wt1r);
106 add_vec(pg, xt1i, wt1i);
107 save_vec(pg, &v2v[icd1].v[0], xt1r);
108 save_vec(pg, &v2v[icd1 + 1].v[0], xt1i);
// Likewise v2v[icd2] is incremented by wt2.
111 load_vec(pg, xt2r, &v2v[icd2].v[0]);
112 load_vec(pg, xt2i, &v2v[icd2 + 1].v[0]);
113 add_vec(pg, xt2r, wt2r);
114 add_vec(pg, xt2i, wt2i);
115 save_vec(pg, &v2v[icd2].v[0], xt2r);
116 save_vec(pg, &v2v[icd2 + 1].v[0], xt2i);
// Final combination over all NVCD entries; the updated v1F is what gets
// stored. NOTE(review): presumably axpy_vec computes v1F += a * v2F - confirm.
122 for (
int i = 0; i <
NVCD; ++i) {
123 load_vec(pg, v1F, &v1[
VLEN * i]);
124 load_vec(pg, v2F, &v2v[i].v[0]);
125 axpy_vec(pg, v1F, a, v2F);
126 save_vec(pg, &v2[
VLEN * i], v1F);
// Helper that forms one complex combination of two entries of `v1`.
//
// It loads the complex entry at real offset 2 * id + 2 * ND * ic into
// (wtr, wti) and the entry four reals further on (offset 2 * id + 4, i.e.
// component id + 2) into (vtr, vti), then applies sub_vec(vtr, wtr) and
// sub_vec(vti, wti), so (vtr, vti) carries the result.
//
//   v1 - input vector, VLEN values per real entry
//   id - component index
//   ic - second index, stride 2 * ND reals
//
// NOTE(review): the line holding the function name and the leading
// parameters (pg, vtr, vti) is not visible here, nor is the declaration of
// wtr/wti. By position and by the six-argument calls elsewhere in the file
// this is presumably set_2sp_up - confirm against the full file.
132 template<
typename REALTYPE>
134 REALTYPE *v1,
int id,
int ic)
137 load_vec(pg, wtr, &v1[
VLEN * (2 *
id + 0 + 2 *
ND * ic)]);
138 load_vec(pg, wti, &v1[
VLEN * (2 *
id + 1 + 2 *
ND * ic)]);
139 load_vec(pg, vtr, &v1[
VLEN * (2 *
id + 4 + 2 *
ND * ic)]);
140 load_vec(pg, vti, &v1[
VLEN * (2 *
id + 5 + 2 *
ND * ic)]);
// Difference variant: the entry at component id is taken off the entry at id + 2.
141 sub_vec(pg, vtr, wtr);
142 sub_vec(pg, vti, wti);
// Helper that forms one complex combination of two entries of `v1`.
//
// It loads the complex entry at real offset 2 * id + 2 * ND * ic into
// (wtr, wti) and the entry four reals further on (offset 2 * id + 4, i.e.
// component id + 2) into (vtr, vti), then applies add_vec(vtr, wtr) and
// add_vec(vti, wti), so (vtr, vti) carries the result.
//
//   v1 - input vector, VLEN values per real entry
//   id - component index
//   ic - second index, stride 2 * ND reals
//
// NOTE(review): the line holding the function name and the leading
// parameters (pg, vtr, vti) is not visible here, nor is the declaration of
// wtr/wti. By position and by the six-argument calls elsewhere in the file
// this is presumably set_2sp_dn - confirm against the full file.
147 template<
typename REALTYPE>
149 REALTYPE *v1,
int id,
int ic)
152 load_vec(pg, wtr, &v1[
VLEN * (2 *
id + 0 + 2 *
ND * ic)]);
153 load_vec(pg, wti, &v1[
VLEN * (2 *
id + 1 + 2 *
ND * ic)]);
154 load_vec(pg, vtr, &v1[
VLEN * (2 *
id + 4 + 2 *
ND * ic)]);
155 load_vec(pg, vti, &v1[
VLEN * (2 *
id + 5 + 2 *
ND * ic)]);
// Sum variant: the entry at component id is added to the entry at id + 2.
156 add_vec(pg, vtr, wtr);
157 add_vec(pg, vti, wti);
// mult_clover_csw_aypx_chrot: fully unrolled clover-term kernel working in
// two passes.
//
// Pass 1 ("up"): builds six complex inputs vt1..vt6 with set_2sp_up,
//   multiplies them by the 6x6 coefficient block addressed through
//   index_clv_up, and folds the six results wt1..wt6 into the work buffer
//   v2v (added to components 2 and 3, subtracted from components 0 and 1).
// Pass 2 ("dn"): the same product with set_2sp_dn / index_clv_dn; the
//   results are added to the values read from v2v, combined with v1 through
//   aypx_vec and stored to v2.
//
//   v2  - output, written in pass 2 only
//   a   - scalar handed to aypx_vec
//   v2v - work buffer, read and updated in pass 1, only read in pass 2
//   ct  - coefficient vectors, addressed as ct[VLEN * index]
//   v1  - input vector
// All pointers are __restrict, so the buffers must not overlap.
//
// NOTE(review): `pg` and the off-diagonal registers (u12r, u12i, ...) are
// used below but their declarations are not among the lines visible here.
// The repeated declarations of yt*, u* and wt* also imply enclosing scopes
// whose braces are not visible - confirm against the full file.
// NOTE(review): the comments below assume axpy_vec(pg, y, a, x) is
// y += a * x, ymax_vec(pg, y, a, x) is y -= a * x and aypx_vec(pg, a, y, x)
// is y = a * y + x; these helpers are defined elsewhere - confirm.
162 template<
typename REALTYPE>
163 inline void mult_clover_csw_aypx_chrot(
164 REALTYPE *__restrict v2, REALTYPE a,
165 REALTYPE *__restrict v2v,
166 REALTYPE *__restrict ct,
167 REALTYPE *__restrict v1)
// ---- Pass 1: "up" block ----
// vt1..vt3 come from id = 0, vt4..vt6 from id = 1, each for ic = 0, 1, 2.
173 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i;
174 svreal_t vt4r, vt4i, vt5r, vt5i, vt6r, vt6i;
176 set_2sp_up(pg, vt1r, vt1i, v1, 0, 0);
177 set_2sp_up(pg, vt2r, vt2i, v1, 0, 1);
178 set_2sp_up(pg, vt3r, vt3i, v1, 0, 2);
179 set_2sp_up(pg, vt4r, vt4i, v1, 1, 0);
180 set_2sp_up(pg, vt5r, vt5i, v1, 1, 1);
181 set_2sp_up(pg, vt6r, vt6i, v1, 1, 2);
// Diagonal coefficients: only the ri = 0 (real) slot is loaded.
183 svreal_t u11r, u22r, u33r, u44r, u55r, u66r;
184 load_vec(pg, u11r, &ct[
VLEN * index_clv_up(0, 0, 0, 0, 0)]);
185 load_vec(pg, u22r, &ct[
VLEN * index_clv_up(0, 1, 0, 1, 0)]);
186 load_vec(pg, u33r, &ct[
VLEN * index_clv_up(0, 2, 0, 2, 0)]);
187 load_vec(pg, u44r, &ct[
VLEN * index_clv_up(1, 0, 1, 0, 0)]);
188 load_vec(pg, u55r, &ct[
VLEN * index_clv_up(1, 1, 1, 1, 0)]);
189 load_vec(pg, u66r, &ct[
VLEN * index_clv_up(1, 2, 1, 2, 0)]);
// Results wtN start as the diagonal coefficient times vtN.
191 svreal_t wt1r, wt1i, wt2r, wt2i, wt3r, wt3i;
192 svreal_t wt4r, wt4i, wt5r, wt5i, wt6r, wt6i;
193 mul_vec(pg, wt1r, u11r, vt1r);
194 mul_vec(pg, wt1i, u11r, vt1i);
195 mul_vec(pg, wt2r, u22r, vt2r);
196 mul_vec(pg, wt2i, u22r, vt2i);
197 mul_vec(pg, wt3r, u33r, vt3r);
198 mul_vec(pg, wt3i, u33r, vt3i);
// Off-diagonal coefficients u_ij (i < j). Each (real, imag) pair is loaded
// ahead of the group of eight updates that consumes it. In every group the
// eight calls update wt_i from vt_j and wt_j from vt_i; the sign pattern of
// the two halves differs (presumably w_i += u_ij * v_j and
// w_j += conj(u_ij) * v_i - confirm helper semantics).
201 load_vec(pg, u12r, &ct[
VLEN * index_clv_up(0, 0, 0, 1, 0)]);
202 load_vec(pg, u12i, &ct[
VLEN * index_clv_up(0, 0, 0, 1, 1)]);
204 mul_vec(pg, wt4r, u44r, vt4r);
205 mul_vec(pg, wt4i, u44r, vt4i);
206 mul_vec(pg, wt5r, u55r, vt5r);
207 mul_vec(pg, wt5i, u55r, vt5i);
208 mul_vec(pg, wt6r, u66r, vt6r);
209 mul_vec(pg, wt6i, u66r, vt6i);
212 load_vec(pg, u34r, &ct[
VLEN * index_clv_up(0, 2, 1, 0, 0)]);
213 load_vec(pg, u34i, &ct[
VLEN * index_clv_up(0, 2, 1, 0, 1)]);
214 load_vec(pg, u56r, &ct[
VLEN * index_clv_up(1, 1, 1, 2, 0)]);
215 load_vec(pg, u56i, &ct[
VLEN * index_clv_up(1, 1, 1, 2, 1)]);
// u12: couples wt1 <-> vt2 and wt2 <-> vt1.
217 axpy_vec(pg, wt1r, u12r, vt2r);
218 axpy_vec(pg, wt1i, u12r, vt2i);
219 ymax_vec(pg, wt1r, u12i, vt2i);
220 axpy_vec(pg, wt1i, u12i, vt2r);
221 axpy_vec(pg, wt2r, u12r, vt1r);
222 axpy_vec(pg, wt2i, u12r, vt1i);
223 axpy_vec(pg, wt2r, u12i, vt1i);
224 ymax_vec(pg, wt2i, u12i, vt1r);
// u34
227 load_vec(pg, u23r, &ct[
VLEN * index_clv_up(0, 1, 0, 2, 0)]);
228 load_vec(pg, u23i, &ct[
VLEN * index_clv_up(0, 1, 0, 2, 1)]);
229 axpy_vec(pg, wt3r, u34r, vt4r);
230 axpy_vec(pg, wt3i, u34r, vt4i);
231 ymax_vec(pg, wt3r, u34i, vt4i);
232 axpy_vec(pg, wt3i, u34i, vt4r);
233 axpy_vec(pg, wt4r, u34r, vt3r);
234 axpy_vec(pg, wt4i, u34r, vt3i);
235 axpy_vec(pg, wt4r, u34i, vt3i);
236 ymax_vec(pg, wt4i, u34i, vt3r);
// u56
239 load_vec(pg, u45r, &ct[
VLEN * index_clv_up(1, 0, 1, 1, 0)]);
240 load_vec(pg, u45i, &ct[
VLEN * index_clv_up(1, 0, 1, 1, 1)]);
241 axpy_vec(pg, wt5r, u56r, vt6r);
242 axpy_vec(pg, wt5i, u56r, vt6i);
243 ymax_vec(pg, wt5r, u56i, vt6i);
244 axpy_vec(pg, wt5i, u56i, vt6r);
245 axpy_vec(pg, wt6r, u56r, vt5r);
246 axpy_vec(pg, wt6i, u56r, vt5i);
247 axpy_vec(pg, wt6r, u56i, vt5i);
248 ymax_vec(pg, wt6i, u56i, vt5r);
// u23
251 load_vec(pg, u13r, &ct[
VLEN * index_clv_up(0, 0, 0, 2, 0)]);
252 load_vec(pg, u13i, &ct[
VLEN * index_clv_up(0, 0, 0, 2, 1)]);
253 axpy_vec(pg, wt2r, u23r, vt3r);
254 axpy_vec(pg, wt2i, u23r, vt3i);
255 ymax_vec(pg, wt2r, u23i, vt3i);
256 axpy_vec(pg, wt2i, u23i, vt3r);
257 axpy_vec(pg, wt3r, u23r, vt2r);
258 axpy_vec(pg, wt3i, u23r, vt2i);
259 axpy_vec(pg, wt3r, u23i, vt2i);
260 ymax_vec(pg, wt3i, u23i, vt2r);
// u45
263 load_vec(pg, u24r, &ct[
VLEN * index_clv_up(0, 1, 1, 0, 0)]);
264 load_vec(pg, u24i, &ct[
VLEN * index_clv_up(0, 1, 1, 0, 1)]);
265 axpy_vec(pg, wt4r, u45r, vt5r);
266 axpy_vec(pg, wt4i, u45r, vt5i);
267 ymax_vec(pg, wt4r, u45i, vt5i);
268 axpy_vec(pg, wt4i, u45i, vt5r);
269 axpy_vec(pg, wt5r, u45r, vt4r);
270 axpy_vec(pg, wt5i, u45r, vt4i);
271 axpy_vec(pg, wt5r, u45i, vt4i);
272 ymax_vec(pg, wt5i, u45i, vt4r);
// u13
275 load_vec(pg, u35r, &ct[
VLEN * index_clv_up(0, 2, 1, 1, 0)]);
276 load_vec(pg, u35i, &ct[
VLEN * index_clv_up(0, 2, 1, 1, 1)]);
277 axpy_vec(pg, wt1r, u13r, vt3r);
278 axpy_vec(pg, wt1i, u13r, vt3i);
279 ymax_vec(pg, wt1r, u13i, vt3i);
280 axpy_vec(pg, wt1i, u13i, vt3r);
281 axpy_vec(pg, wt3r, u13r, vt1r);
282 axpy_vec(pg, wt3i, u13r, vt1i);
283 axpy_vec(pg, wt3r, u13i, vt1i);
284 ymax_vec(pg, wt3i, u13i, vt1r);
// u24
287 load_vec(pg, u46r, &ct[
VLEN * index_clv_up(1, 0, 1, 2, 0)]);
288 load_vec(pg, u46i, &ct[
VLEN * index_clv_up(1, 0, 1, 2, 1)]);
289 axpy_vec(pg, wt2r, u24r, vt4r);
290 axpy_vec(pg, wt2i, u24r, vt4i);
291 ymax_vec(pg, wt2r, u24i, vt4i);
292 axpy_vec(pg, wt2i, u24i, vt4r);
293 axpy_vec(pg, wt4r, u24r, vt2r);
294 axpy_vec(pg, wt4i, u24r, vt2i);
295 axpy_vec(pg, wt4r, u24i, vt2i);
296 ymax_vec(pg, wt4i, u24i, vt2r);
// u35
299 load_vec(pg, u15r, &ct[
VLEN * index_clv_up(0, 0, 1, 1, 0)]);
300 load_vec(pg, u15i, &ct[
VLEN * index_clv_up(0, 0, 1, 1, 1)]);
301 axpy_vec(pg, wt3r, u35r, vt5r);
302 axpy_vec(pg, wt3i, u35r, vt5i);
303 ymax_vec(pg, wt3r, u35i, vt5i);
304 axpy_vec(pg, wt3i, u35i, vt5r);
305 axpy_vec(pg, wt5r, u35r, vt3r);
306 axpy_vec(pg, wt5i, u35r, vt3i);
307 axpy_vec(pg, wt5r, u35i, vt3i);
308 ymax_vec(pg, wt5i, u35i, vt3r);
// u46 (here the wt6 half is written before the wt4 half)
311 load_vec(pg, u26r, &ct[
VLEN * index_clv_up(0, 1, 1, 2, 0)]);
312 load_vec(pg, u26i, &ct[
VLEN * index_clv_up(0, 1, 1, 2, 1)]);
313 axpy_vec(pg, wt6r, u46r, vt4r);
314 axpy_vec(pg, wt6i, u46r, vt4i);
315 axpy_vec(pg, wt6r, u46i, vt4i);
316 ymax_vec(pg, wt6i, u46i, vt4r);
317 axpy_vec(pg, wt4r, u46r, vt6r);
318 axpy_vec(pg, wt4i, u46r, vt6i);
319 ymax_vec(pg, wt4r, u46i, vt6i);
320 axpy_vec(pg, wt4i, u46i, vt6r);
// u15
323 load_vec(pg, u14r, &ct[
VLEN * index_clv_up(0, 0, 1, 0, 0)]);
324 load_vec(pg, u14i, &ct[
VLEN * index_clv_up(0, 0, 1, 0, 1)]);
325 axpy_vec(pg, wt1r, u15r, vt5r);
326 axpy_vec(pg, wt1i, u15r, vt5i);
327 ymax_vec(pg, wt1r, u15i, vt5i);
328 axpy_vec(pg, wt1i, u15i, vt5r);
329 axpy_vec(pg, wt5r, u15r, vt1r);
330 axpy_vec(pg, wt5i, u15r, vt1i);
331 axpy_vec(pg, wt5r, u15i, vt1i);
332 ymax_vec(pg, wt5i, u15i, vt1r);
// u26
335 load_vec(pg, u36r, &ct[
VLEN * index_clv_up(0, 2, 1, 2, 0)]);
336 load_vec(pg, u36i, &ct[
VLEN * index_clv_up(0, 2, 1, 2, 1)]);
337 axpy_vec(pg, wt6r, u26r, vt2r);
338 axpy_vec(pg, wt6i, u26r, vt2i);
339 axpy_vec(pg, wt6r, u26i, vt2i);
340 ymax_vec(pg, wt6i, u26i, vt2r);
341 axpy_vec(pg, wt2r, u26r, vt6r);
342 axpy_vec(pg, wt2i, u26r, vt6i);
343 ymax_vec(pg, wt2r, u26i, vt6i);
344 axpy_vec(pg, wt2i, u26i, vt6r);
// u14
347 load_vec(pg, u25r, &ct[
VLEN * index_clv_up(0, 1, 1, 1, 0)]);
348 load_vec(pg, u25i, &ct[
VLEN * index_clv_up(0, 1, 1, 1, 1)]);
349 axpy_vec(pg, wt4r, u14r, vt1r);
350 axpy_vec(pg, wt4i, u14r, vt1i);
351 axpy_vec(pg, wt4r, u14i, vt1i);
352 ymax_vec(pg, wt4i, u14i, vt1r);
353 axpy_vec(pg, wt1r, u14r, vt4r);
354 axpy_vec(pg, wt1i, u14r, vt4i);
355 ymax_vec(pg, wt1r, u14i, vt4i);
356 axpy_vec(pg, wt1i, u14i, vt4r);
// u36
359 load_vec(pg, u16r, &ct[
VLEN * index_clv_up(0, 0, 1, 2, 0)]);
360 load_vec(pg, u16i, &ct[
VLEN * index_clv_up(0, 0, 1, 2, 1)]);
361 axpy_vec(pg, wt6r, u36r, vt3r);
362 axpy_vec(pg, wt6i, u36r, vt3i);
363 axpy_vec(pg, wt6r, u36i, vt3i);
364 ymax_vec(pg, wt6i, u36i, vt3r);
365 axpy_vec(pg, wt3r, u36r, vt6r);
366 axpy_vec(pg, wt3i, u36r, vt6i);
367 ymax_vec(pg, wt3r, u36i, vt6i);
368 axpy_vec(pg, wt3i, u36i, vt6r);
// u25
371 axpy_vec(pg, wt2r, u25r, vt5r);
372 axpy_vec(pg, wt2i, u25r, vt5i);
373 ymax_vec(pg, wt2r, u25i, vt5i);
374 axpy_vec(pg, wt2i, u25i, vt5r);
375 axpy_vec(pg, wt5r, u25r, vt2r);
376 axpy_vec(pg, wt5i, u25r, vt2i);
377 axpy_vec(pg, wt5r, u25i, vt2i);
378 ymax_vec(pg, wt5i, u25i, vt2r);
// u16
380 axpy_vec(pg, wt1r, u16r, vt6r);
381 axpy_vec(pg, wt1i, u16r, vt6i);
382 ymax_vec(pg, wt1r, u16i, vt6i);
383 axpy_vec(pg, wt1i, u16i, vt6r);
384 axpy_vec(pg, wt6r, u16r, vt1r);
385 axpy_vec(pg, wt6i, u16r, vt1i);
386 axpy_vec(pg, wt6r, u16i, vt1i);
387 ymax_vec(pg, wt6i, u16i, vt1r);
// Fold the "up" results into v2v. Entries are addressed as
// VLEN * (2 * (s + 4 * c)) for the real part (+ VLEN for the imaginary
// part) with component s and c = 0, 1, 2.
// Component s = 2: v2v += wt1..wt3.
390 svreal_t yt1r, yt1i, yt2r, yt2i, yt3r, yt3i;
391 load_vec(pg, yt1r, &v2v[
VLEN * (2 * (2 + 4 * 0))]);
392 load_vec(pg, yt1i, &v2v[
VLEN * (1 + 2 * (2 + 4 * 0))]);
393 load_vec(pg, yt2r, &v2v[
VLEN * (2 * (2 + 4 * 1))]);
394 load_vec(pg, yt2i, &v2v[
VLEN * (1 + 2 * (2 + 4 * 1))]);
395 load_vec(pg, yt3r, &v2v[
VLEN * (2 * (2 + 4 * 2))]);
396 load_vec(pg, yt3i, &v2v[
VLEN * (1 + 2 * (2 + 4 * 2))]);
397 add_vec(pg, yt1r, wt1r);
398 add_vec(pg, yt1i, wt1i);
399 add_vec(pg, yt2r, wt2r);
400 add_vec(pg, yt2i, wt2i);
401 add_vec(pg, yt3r, wt3r);
402 add_vec(pg, yt3i, wt3i);
403 save_vec(pg, &v2v[
VLEN * (2 * (2 + 4 * 0))], yt1r);
404 save_vec(pg, &v2v[
VLEN * (1 + 2 * (2 + 4 * 0))], yt1i);
405 save_vec(pg, &v2v[
VLEN * (2 * (2 + 4 * 1))], yt2r);
406 save_vec(pg, &v2v[
VLEN * (1 + 2 * (2 + 4 * 1))], yt2i);
407 save_vec(pg, &v2v[
VLEN * (2 * (2 + 4 * 2))], yt3r);
408 save_vec(pg, &v2v[
VLEN * (1 + 2 * (2 + 4 * 2))], yt3i);
// Component s = 3: v2v += wt4..wt6.
410 svreal_t yt4r, yt4i, yt5r, yt5i, yt6r, yt6i;
411 load_vec(pg, yt4r, &v2v[
VLEN * (2 * (3 + 4 * 0))]);
412 load_vec(pg, yt4i, &v2v[
VLEN * (1 + 2 * (3 + 4 * 0))]);
413 load_vec(pg, yt5r, &v2v[
VLEN * (2 * (3 + 4 * 1))]);
414 load_vec(pg, yt5i, &v2v[
VLEN * (1 + 2 * (3 + 4 * 1))]);
415 load_vec(pg, yt6r, &v2v[
VLEN * (2 * (3 + 4 * 2))]);
416 load_vec(pg, yt6i, &v2v[
VLEN * (1 + 2 * (3 + 4 * 2))]);
417 add_vec(pg, yt4r, wt4r);
418 add_vec(pg, yt4i, wt4i);
419 add_vec(pg, yt5r, wt5r);
420 add_vec(pg, yt5i, wt5i);
421 add_vec(pg, yt6r, wt6r);
422 add_vec(pg, yt6i, wt6i);
423 save_vec(pg, &v2v[
VLEN * (2 * (3 + 4 * 0))], yt4r);
424 save_vec(pg, &v2v[
VLEN * (1 + 2 * (3 + 4 * 0))], yt4i);
425 save_vec(pg, &v2v[
VLEN * (2 * (3 + 4 * 1))], yt5r);
426 save_vec(pg, &v2v[
VLEN * (1 + 2 * (3 + 4 * 1))], yt5i);
427 save_vec(pg, &v2v[
VLEN * (2 * (3 + 4 * 2))], yt6r);
428 save_vec(pg, &v2v[
VLEN * (1 + 2 * (3 + 4 * 2))], yt6i);
// Component s = 0: v2v -= wt1..wt3 (note sub_vec instead of add_vec).
431 svreal_t yt1r, yt1i, yt2r, yt2i, yt3r, yt3i;
432 load_vec(pg, yt1r, &v2v[
VLEN * (2 * (0 + 4 * 0))]);
433 load_vec(pg, yt1i, &v2v[
VLEN * (1 + 2 * (0 + 4 * 0))]);
434 load_vec(pg, yt2r, &v2v[
VLEN * (2 * (0 + 4 * 1))]);
435 load_vec(pg, yt2i, &v2v[
VLEN * (1 + 2 * (0 + 4 * 1))]);
436 load_vec(pg, yt3r, &v2v[
VLEN * (2 * (0 + 4 * 2))]);
437 load_vec(pg, yt3i, &v2v[
VLEN * (1 + 2 * (0 + 4 * 2))]);
438 sub_vec(pg, yt1r, wt1r);
439 sub_vec(pg, yt1i, wt1i);
440 sub_vec(pg, yt2r, wt2r);
441 sub_vec(pg, yt2i, wt2i);
442 sub_vec(pg, yt3r, wt3r);
443 sub_vec(pg, yt3i, wt3i);
444 save_vec(pg, &v2v[
VLEN * (2 * (0 + 4 * 0))], yt1r);
445 save_vec(pg, &v2v[
VLEN * (1 + 2 * (0 + 4 * 0))], yt1i);
446 save_vec(pg, &v2v[
VLEN * (2 * (0 + 4 * 1))], yt2r);
447 save_vec(pg, &v2v[
VLEN * (1 + 2 * (0 + 4 * 1))], yt2i);
448 save_vec(pg, &v2v[
VLEN * (2 * (0 + 4 * 2))], yt3r);
449 save_vec(pg, &v2v[
VLEN * (1 + 2 * (0 + 4 * 2))], yt3i);
// Component s = 1: v2v -= wt4..wt6.
451 svreal_t yt4r, yt4i, yt5r, yt5i, yt6r, yt6i;
452 load_vec(pg, yt4r, &v2v[
VLEN * (2 * (1 + 4 * 0))]);
453 load_vec(pg, yt4i, &v2v[
VLEN * (1 + 2 * (1 + 4 * 0))]);
454 load_vec(pg, yt5r, &v2v[
VLEN * (2 * (1 + 4 * 1))]);
455 load_vec(pg, yt5i, &v2v[
VLEN * (1 + 2 * (1 + 4 * 1))]);
456 load_vec(pg, yt6r, &v2v[
VLEN * (2 * (1 + 4 * 2))]);
457 load_vec(pg, yt6i, &v2v[
VLEN * (1 + 2 * (1 + 4 * 2))]);
458 sub_vec(pg, yt4r, wt4r);
459 sub_vec(pg, yt4i, wt4i);
460 sub_vec(pg, yt5r, wt5r);
461 sub_vec(pg, yt5i, wt5i);
462 sub_vec(pg, yt6r, wt6r);
463 sub_vec(pg, yt6i, wt6i);
464 save_vec(pg, &v2v[
VLEN * (2 * (1 + 4 * 0))], yt4r);
465 save_vec(pg, &v2v[
VLEN * (1 + 2 * (1 + 4 * 0))], yt4i);
466 save_vec(pg, &v2v[
VLEN * (2 * (1 + 4 * 1))], yt5r);
467 save_vec(pg, &v2v[
VLEN * (1 + 2 * (1 + 4 * 1))], yt5i);
468 save_vec(pg, &v2v[
VLEN * (2 * (1 + 4 * 2))], yt6r);
469 save_vec(pg, &v2v[
VLEN * (1 + 2 * (1 + 4 * 2))], yt6i);
// ---- Pass 2: "dn" block ----
// The vt registers from pass 1 are reused and refilled by set_2sp_dn.
473 set_2sp_dn(pg, vt1r, vt1i, v1, 0, 0);
474 set_2sp_dn(pg, vt2r, vt2i, v1, 0, 1);
475 set_2sp_dn(pg, vt3r, vt3i, v1, 0, 2);
476 set_2sp_dn(pg, vt4r, vt4i, v1, 1, 0);
477 set_2sp_dn(pg, vt5r, vt5i, v1, 1, 1);
478 set_2sp_dn(pg, vt6r, vt6i, v1, 1, 2);
// Diagonal coefficients of the second block (index_clv_dn).
480 svreal_t u11r, u22r, u33r, u44r, u55r, u66r;
481 load_vec(pg, u11r, &ct[
VLEN * index_clv_dn(0, 0, 0, 0, 0)]);
482 load_vec(pg, u22r, &ct[
VLEN * index_clv_dn(0, 1, 0, 1, 0)]);
483 load_vec(pg, u33r, &ct[
VLEN * index_clv_dn(0, 2, 0, 2, 0)]);
484 load_vec(pg, u44r, &ct[
VLEN * index_clv_dn(1, 0, 1, 0, 0)]);
485 load_vec(pg, u55r, &ct[
VLEN * index_clv_dn(1, 1, 1, 1, 0)]);
486 load_vec(pg, u66r, &ct[
VLEN * index_clv_dn(1, 2, 1, 2, 0)]);
// From here the sequence mirrors pass 1 statement for statement, with
// index_clv_dn in place of index_clv_up.
490 svreal_t wt1r, wt1i, wt2r, wt2i, wt3r, wt3i;
491 svreal_t wt4r, wt4i, wt5r, wt5i, wt6r, wt6i;
492 mul_vec(pg, wt1r, u11r, vt1r);
493 mul_vec(pg, wt1i, u11r, vt1i);
494 mul_vec(pg, wt2r, u22r, vt2r);
495 mul_vec(pg, wt2i, u22r, vt2i);
496 mul_vec(pg, wt3r, u33r, vt3r);
497 mul_vec(pg, wt3i, u33r, vt3i);
500 load_vec(pg, u12r, &ct[
VLEN * index_clv_dn(0, 0, 0, 1, 0)]);
501 load_vec(pg, u12i, &ct[
VLEN * index_clv_dn(0, 0, 0, 1, 1)]);
503 mul_vec(pg, wt4r, u44r, vt4r);
504 mul_vec(pg, wt4i, u44r, vt4i);
505 mul_vec(pg, wt5r, u55r, vt5r);
506 mul_vec(pg, wt5i, u55r, vt5i);
507 mul_vec(pg, wt6r, u66r, vt6r);
508 mul_vec(pg, wt6i, u66r, vt6i);
511 load_vec(pg, u34r, &ct[
VLEN * index_clv_dn(0, 2, 1, 0, 0)]);
512 load_vec(pg, u34i, &ct[
VLEN * index_clv_dn(0, 2, 1, 0, 1)]);
513 load_vec(pg, u56r, &ct[
VLEN * index_clv_dn(1, 1, 1, 2, 0)]);
514 load_vec(pg, u56i, &ct[
VLEN * index_clv_dn(1, 1, 1, 2, 1)]);
// u12
516 axpy_vec(pg, wt1r, u12r, vt2r);
517 axpy_vec(pg, wt1i, u12r, vt2i);
518 ymax_vec(pg, wt1r, u12i, vt2i);
519 axpy_vec(pg, wt1i, u12i, vt2r);
520 axpy_vec(pg, wt2r, u12r, vt1r);
521 axpy_vec(pg, wt2i, u12r, vt1i);
522 axpy_vec(pg, wt2r, u12i, vt1i);
523 ymax_vec(pg, wt2i, u12i, vt1r);
// u34
526 load_vec(pg, u23r, &ct[
VLEN * index_clv_dn(0, 1, 0, 2, 0)]);
527 load_vec(pg, u23i, &ct[
VLEN * index_clv_dn(0, 1, 0, 2, 1)]);
528 axpy_vec(pg, wt3r, u34r, vt4r);
529 axpy_vec(pg, wt3i, u34r, vt4i);
530 ymax_vec(pg, wt3r, u34i, vt4i);
531 axpy_vec(pg, wt3i, u34i, vt4r);
532 axpy_vec(pg, wt4r, u34r, vt3r);
533 axpy_vec(pg, wt4i, u34r, vt3i);
534 axpy_vec(pg, wt4r, u34i, vt3i);
535 ymax_vec(pg, wt4i, u34i, vt3r);
// u56
538 load_vec(pg, u45r, &ct[
VLEN * index_clv_dn(1, 0, 1, 1, 0)]);
539 load_vec(pg, u45i, &ct[
VLEN * index_clv_dn(1, 0, 1, 1, 1)]);
540 axpy_vec(pg, wt5r, u56r, vt6r);
541 axpy_vec(pg, wt5i, u56r, vt6i);
542 ymax_vec(pg, wt5r, u56i, vt6i);
543 axpy_vec(pg, wt5i, u56i, vt6r);
544 axpy_vec(pg, wt6r, u56r, vt5r);
545 axpy_vec(pg, wt6i, u56r, vt5i);
546 axpy_vec(pg, wt6r, u56i, vt5i);
547 ymax_vec(pg, wt6i, u56i, vt5r);
// u23
550 load_vec(pg, u13r, &ct[
VLEN * index_clv_dn(0, 0, 0, 2, 0)]);
551 load_vec(pg, u13i, &ct[
VLEN * index_clv_dn(0, 0, 0, 2, 1)]);
552 axpy_vec(pg, wt2r, u23r, vt3r);
553 axpy_vec(pg, wt2i, u23r, vt3i);
554 ymax_vec(pg, wt2r, u23i, vt3i);
555 axpy_vec(pg, wt2i, u23i, vt3r);
556 axpy_vec(pg, wt3r, u23r, vt2r);
557 axpy_vec(pg, wt3i, u23r, vt2i);
558 axpy_vec(pg, wt3r, u23i, vt2i);
559 ymax_vec(pg, wt3i, u23i, vt2r);
// u45
562 load_vec(pg, u24r, &ct[
VLEN * index_clv_dn(0, 1, 1, 0, 0)]);
563 load_vec(pg, u24i, &ct[
VLEN * index_clv_dn(0, 1, 1, 0, 1)]);
564 axpy_vec(pg, wt4r, u45r, vt5r);
565 axpy_vec(pg, wt4i, u45r, vt5i);
566 ymax_vec(pg, wt4r, u45i, vt5i);
567 axpy_vec(pg, wt4i, u45i, vt5r);
568 axpy_vec(pg, wt5r, u45r, vt4r);
569 axpy_vec(pg, wt5i, u45r, vt4i);
570 axpy_vec(pg, wt5r, u45i, vt4i);
571 ymax_vec(pg, wt5i, u45i, vt4r);
// u13
574 load_vec(pg, u35r, &ct[
VLEN * index_clv_dn(0, 2, 1, 1, 0)]);
575 load_vec(pg, u35i, &ct[
VLEN * index_clv_dn(0, 2, 1, 1, 1)]);
576 axpy_vec(pg, wt1r, u13r, vt3r);
577 axpy_vec(pg, wt1i, u13r, vt3i);
578 ymax_vec(pg, wt1r, u13i, vt3i);
579 axpy_vec(pg, wt1i, u13i, vt3r);
580 axpy_vec(pg, wt3r, u13r, vt1r);
581 axpy_vec(pg, wt3i, u13r, vt1i);
582 axpy_vec(pg, wt3r, u13i, vt1i);
583 ymax_vec(pg, wt3i, u13i, vt1r);
// u24
586 load_vec(pg, u46r, &ct[
VLEN * index_clv_dn(1, 0, 1, 2, 0)]);
587 load_vec(pg, u46i, &ct[
VLEN * index_clv_dn(1, 0, 1, 2, 1)]);
588 axpy_vec(pg, wt2r, u24r, vt4r);
589 axpy_vec(pg, wt2i, u24r, vt4i);
590 ymax_vec(pg, wt2r, u24i, vt4i);
591 axpy_vec(pg, wt2i, u24i, vt4r);
592 axpy_vec(pg, wt4r, u24r, vt2r);
593 axpy_vec(pg, wt4i, u24r, vt2i);
594 axpy_vec(pg, wt4r, u24i, vt2i);
595 ymax_vec(pg, wt4i, u24i, vt2r);
// u35
598 load_vec(pg, u15r, &ct[
VLEN * index_clv_dn(0, 0, 1, 1, 0)]);
599 load_vec(pg, u15i, &ct[
VLEN * index_clv_dn(0, 0, 1, 1, 1)]);
600 axpy_vec(pg, wt3r, u35r, vt5r);
601 axpy_vec(pg, wt3i, u35r, vt5i);
602 ymax_vec(pg, wt3r, u35i, vt5i);
603 axpy_vec(pg, wt3i, u35i, vt5r);
604 axpy_vec(pg, wt5r, u35r, vt3r);
605 axpy_vec(pg, wt5i, u35r, vt3i);
606 axpy_vec(pg, wt5r, u35i, vt3i);
607 ymax_vec(pg, wt5i, u35i, vt3r);
// u46
610 load_vec(pg, u26r, &ct[
VLEN * index_clv_dn(0, 1, 1, 2, 0)]);
611 load_vec(pg, u26i, &ct[
VLEN * index_clv_dn(0, 1, 1, 2, 1)]);
612 axpy_vec(pg, wt6r, u46r, vt4r);
613 axpy_vec(pg, wt6i, u46r, vt4i);
614 axpy_vec(pg, wt6r, u46i, vt4i);
615 ymax_vec(pg, wt6i, u46i, vt4r);
616 axpy_vec(pg, wt4r, u46r, vt6r);
617 axpy_vec(pg, wt4i, u46r, vt6i);
618 ymax_vec(pg, wt4r, u46i, vt6i);
619 axpy_vec(pg, wt4i, u46i, vt6r);
// u15
622 load_vec(pg, u14r, &ct[
VLEN * index_clv_dn(0, 0, 1, 0, 0)]);
623 load_vec(pg, u14i, &ct[
VLEN * index_clv_dn(0, 0, 1, 0, 1)]);
624 axpy_vec(pg, wt1r, u15r, vt5r);
625 axpy_vec(pg, wt1i, u15r, vt5i);
626 ymax_vec(pg, wt1r, u15i, vt5i);
627 axpy_vec(pg, wt1i, u15i, vt5r);
628 axpy_vec(pg, wt5r, u15r, vt1r);
629 axpy_vec(pg, wt5i, u15r, vt1i);
630 axpy_vec(pg, wt5r, u15i, vt1i);
631 ymax_vec(pg, wt5i, u15i, vt1r);
// u26
634 load_vec(pg, u36r, &ct[
VLEN * index_clv_dn(0, 2, 1, 2, 0)]);
635 load_vec(pg, u36i, &ct[
VLEN * index_clv_dn(0, 2, 1, 2, 1)]);
636 axpy_vec(pg, wt6r, u26r, vt2r);
637 axpy_vec(pg, wt6i, u26r, vt2i);
638 axpy_vec(pg, wt6r, u26i, vt2i);
639 ymax_vec(pg, wt6i, u26i, vt2r);
640 axpy_vec(pg, wt2r, u26r, vt6r);
641 axpy_vec(pg, wt2i, u26r, vt6i);
642 ymax_vec(pg, wt2r, u26i, vt6i);
643 axpy_vec(pg, wt2i, u26i, vt6r);
// u14
646 load_vec(pg, u25r, &ct[
VLEN * index_clv_dn(0, 1, 1, 1, 0)]);
647 load_vec(pg, u25i, &ct[
VLEN * index_clv_dn(0, 1, 1, 1, 1)]);
648 axpy_vec(pg, wt4r, u14r, vt1r);
649 axpy_vec(pg, wt4i, u14r, vt1i);
650 axpy_vec(pg, wt4r, u14i, vt1i);
651 ymax_vec(pg, wt4i, u14i, vt1r);
652 axpy_vec(pg, wt1r, u14r, vt4r);
653 axpy_vec(pg, wt1i, u14r, vt4i);
654 ymax_vec(pg, wt1r, u14i, vt4i);
655 axpy_vec(pg, wt1i, u14i, vt4r);
// u36
658 load_vec(pg, u16r, &ct[
VLEN * index_clv_dn(0, 0, 1, 2, 0)]);
659 load_vec(pg, u16i, &ct[
VLEN * index_clv_dn(0, 0, 1, 2, 1)]);
660 axpy_vec(pg, wt6r, u36r, vt3r);
661 axpy_vec(pg, wt6i, u36r, vt3i);
662 axpy_vec(pg, wt6r, u36i, vt3i);
663 ymax_vec(pg, wt6i, u36i, vt3r);
664 axpy_vec(pg, wt3r, u36r, vt6r);
665 axpy_vec(pg, wt3i, u36r, vt6i);
666 ymax_vec(pg, wt3r, u36i, vt6i);
667 axpy_vec(pg, wt3i, u36i, vt6r);
// u25
670 axpy_vec(pg, wt2r, u25r, vt5r);
671 axpy_vec(pg, wt2i, u25r, vt5i);
672 ymax_vec(pg, wt2r, u25i, vt5i);
673 axpy_vec(pg, wt2i, u25i, vt5r);
674 axpy_vec(pg, wt5r, u25r, vt2r);
675 axpy_vec(pg, wt5i, u25r, vt2i);
676 axpy_vec(pg, wt5r, u25i, vt2i);
677 ymax_vec(pg, wt5i, u25i, vt2r);
// u16
679 axpy_vec(pg, wt1r, u16r, vt6r);
680 axpy_vec(pg, wt1i, u16r, vt6i);
681 ymax_vec(pg, wt1r, u16i, vt6i);
682 axpy_vec(pg, wt1i, u16i, vt6r);
683 axpy_vec(pg, wt6r, u16r, vt1r);
684 axpy_vec(pg, wt6i, u16r, vt1i);
685 axpy_vec(pg, wt6r, u16i, vt1i);
686 ymax_vec(pg, wt6i, u16i, vt1r);
// Final combination, done for the component pairs (0, 1) and (2, 3):
// the "dn" results are added with the same sign to both pairs, the sum is
// combined with v1 via aypx_vec and stored to v2. v2v is only read here.
689 for (
int sp = 0; sp < 4; sp += 2) {
// Component sp: v2v + wt1..wt3.
691 svreal_t yt1r, yt1i, yt2r, yt2i, yt3r, yt3i;
692 load_vec(pg, yt1r, &v2v[
VLEN * (2 * (sp + 4 * 0))]);
693 load_vec(pg, yt1i, &v2v[
VLEN * (1 + 2 * (sp + 4 * 0))]);
694 load_vec(pg, yt2r, &v2v[
VLEN * (2 * (sp + 4 * 1))]);
695 load_vec(pg, yt2i, &v2v[
VLEN * (1 + 2 * (sp + 4 * 1))]);
696 load_vec(pg, yt3r, &v2v[
VLEN * (2 * (sp + 4 * 2))]);
697 load_vec(pg, yt3i, &v2v[
VLEN * (1 + 2 * (sp + 4 * 2))]);
698 add_vec(pg, yt1r, wt1r);
699 add_vec(pg, yt1i, wt1i);
700 add_vec(pg, yt2r, wt2r);
701 add_vec(pg, yt2i, wt2i);
702 add_vec(pg, yt3r, wt3r);
703 add_vec(pg, yt3i, wt3i);
// Matching entries of the input vector.
705 svreal_t xt1r, xt1i, xt2r, xt2i, xt3r, xt3i;
706 load_vec(pg, xt1r, &v1[
VLEN * (2 * (sp + 4 * 0))]);
707 load_vec(pg, xt1i, &v1[
VLEN * (1 + 2 * (sp + 4 * 0))]);
708 load_vec(pg, xt2r, &v1[
VLEN * (2 * (sp + 4 * 1))]);
709 load_vec(pg, xt2i, &v1[
VLEN * (1 + 2 * (sp + 4 * 1))]);
710 load_vec(pg, xt3r, &v1[
VLEN * (2 * (sp + 4 * 2))]);
711 load_vec(pg, xt3i, &v1[
VLEN * (1 + 2 * (sp + 4 * 2))]);
// The yt registers hold the result afterwards (they are what is stored).
713 aypx_vec(pg, a, yt1r, xt1r);
714 aypx_vec(pg, a, yt1i, xt1i);
715 aypx_vec(pg, a, yt2r, xt2r);
716 aypx_vec(pg, a, yt2i, xt2i);
717 aypx_vec(pg, a, yt3r, xt3r);
718 aypx_vec(pg, a, yt3i, xt3i);
720 save_vec(pg, &v2[
VLEN * (2 * (sp + 4 * 0))], yt1r);
721 save_vec(pg, &v2[
VLEN * (1 + 2 * (sp + 4 * 0))], yt1i);
722 save_vec(pg, &v2[
VLEN * (2 * (sp + 4 * 1))], yt2r);
723 save_vec(pg, &v2[
VLEN * (1 + 2 * (sp + 4 * 1))], yt2i);
724 save_vec(pg, &v2[
VLEN * (2 * (sp + 4 * 2))], yt3r);
725 save_vec(pg, &v2[
VLEN * (1 + 2 * (sp + 4 * 2))], yt3i);
// Component sp + 1: v2v + wt4..wt6, same treatment.
728 svreal_t yt4r, yt4i, yt5r, yt5i, yt6r, yt6i;
729 load_vec(pg, yt4r, &v2v[
VLEN * (2 * (sp + 1 + 4 * 0))]);
730 load_vec(pg, yt4i, &v2v[
VLEN * (1 + 2 * (sp + 1 + 4 * 0))]);
731 load_vec(pg, yt5r, &v2v[
VLEN * (2 * (sp + 1 + 4 * 1))]);
732 load_vec(pg, yt5i, &v2v[
VLEN * (1 + 2 * (sp + 1 + 4 * 1))]);
733 load_vec(pg, yt6r, &v2v[
VLEN * (2 * (sp + 1 + 4 * 2))]);
734 load_vec(pg, yt6i, &v2v[
VLEN * (1 + 2 * (sp + 1 + 4 * 2))]);
735 add_vec(pg, yt4r, wt4r);
736 add_vec(pg, yt4i, wt4i);
737 add_vec(pg, yt5r, wt5r);
738 add_vec(pg, yt5i, wt5i);
739 add_vec(pg, yt6r, wt6r);
740 add_vec(pg, yt6i, wt6i);
742 svreal_t xt4r, xt4i, xt5r, xt5i, xt6r, xt6i;
743 load_vec(pg, xt4r, &v1[
VLEN * (2 * (sp + 1 + 4 * 0))]);
744 load_vec(pg, xt4i, &v1[
VLEN * (1 + 2 * (sp + 1 + 4 * 0))]);
745 load_vec(pg, xt5r, &v1[
VLEN * (2 * (sp + 1 + 4 * 1))]);
746 load_vec(pg, xt5i, &v1[
VLEN * (1 + 2 * (sp + 1 + 4 * 1))]);
747 load_vec(pg, xt6r, &v1[
VLEN * (2 * (sp + 1 + 4 * 2))]);
748 load_vec(pg, xt6i, &v1[
VLEN * (1 + 2 * (sp + 1 + 4 * 2))]);
750 aypx_vec(pg, a, yt4r, xt4r);
751 aypx_vec(pg, a, yt4i, xt4i);
752 aypx_vec(pg, a, yt5r, xt5r);
753 aypx_vec(pg, a, yt5i, xt5i);
754 aypx_vec(pg, a, yt6r, xt6r);
755 aypx_vec(pg, a, yt6i, xt6i);
757 save_vec(pg, &v2[
VLEN * (2 * (sp + 1 + 4 * 0))], yt4r);
758 save_vec(pg, &v2[
VLEN * (1 + 2 * (sp + 1 + 4 * 0))], yt4i);
759 save_vec(pg, &v2[
VLEN * (2 * (sp + 1 + 4 * 1))], yt5r);
760 save_vec(pg, &v2[
VLEN * (1 + 2 * (sp + 1 + 4 * 1))], yt5i);
761 save_vec(pg, &v2[
VLEN * (2 * (sp + 1 + 4 * 2))], yt6r);
762 save_vec(pg, &v2[
VLEN * (1 + 2 * (sp + 1 + 4 * 2))], yt6i);
770 template<
typename REALTYPE>
771 inline void mult_cswinv_chrot(REALTYPE *__restrict v2,
772 REALTYPE *__restrict ct,
773 REALTYPE *__restrict v1)
777 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i;
778 svreal_t vt4r, vt4i, vt5r, vt5i, vt6r, vt6i;
785 set_2sp_up(pg, vt1r, vt1i, v1, 0, 0);
786 set_2sp_up(pg, vt2r, vt2i, v1, 0, 1);
787 set_2sp_up(pg, vt3r, vt3i, v1, 0, 2);
788 set_2sp_up(pg, vt4r, vt4i, v1, 1, 0);
789 set_2sp_up(pg, vt5r, vt5i, v1, 1, 1);
790 set_2sp_up(pg, vt6r, vt6i, v1, 1, 2);
792 svreal_t u11r, u22r, u33r, u44r, u55r, u66r;
793 load_vec(pg, u11r, &ct[
VLEN * index_clv_up(0, 0, 0, 0, 0)]);
794 load_vec(pg, u22r, &ct[
VLEN * index_clv_up(0, 1, 0, 1, 0)]);
795 load_vec(pg, u33r, &ct[
VLEN * index_clv_up(0, 2, 0, 2, 0)]);
796 load_vec(pg, u44r, &ct[
VLEN * index_clv_up(1, 0, 1, 0, 0)]);
797 load_vec(pg, u55r, &ct[
VLEN * index_clv_up(1, 1, 1, 1, 0)]);
798 load_vec(pg, u66r, &ct[
VLEN * index_clv_up(1, 2, 1, 2, 0)]);
800 svreal_t wt1r, wt1i, wt2r, wt2i, wt3r, wt3i;
801 svreal_t wt4r, wt4i, wt5r, wt5i, wt6r, wt6i;
802 mul_vec(pg, wt1r, u11r, vt1r);
803 mul_vec(pg, wt1i, u11r, vt1i);
804 mul_vec(pg, wt2r, u22r, vt2r);
805 mul_vec(pg, wt2i, u22r, vt2i);
806 mul_vec(pg, wt3r, u33r, vt3r);
807 mul_vec(pg, wt3i, u33r, vt3i);
810 load_vec(pg, u12r, &ct[
VLEN * index_clv_up(0, 0, 0, 1, 0)]);
811 load_vec(pg, u12i, &ct[
VLEN * index_clv_up(0, 0, 0, 1, 1)]);
813 mul_vec(pg, wt4r, u44r, vt4r);
814 mul_vec(pg, wt4i, u44r, vt4i);
815 mul_vec(pg, wt5r, u55r, vt5r);
816 mul_vec(pg, wt5i, u55r, vt5i);
817 mul_vec(pg, wt6r, u66r, vt6r);
818 mul_vec(pg, wt6i, u66r, vt6i);
821 load_vec(pg, u34r, &ct[
VLEN * index_clv_up(0, 2, 1, 0, 0)]);
822 load_vec(pg, u34i, &ct[
VLEN * index_clv_up(0, 2, 1, 0, 1)]);
823 load_vec(pg, u56r, &ct[
VLEN * index_clv_up(1, 1, 1, 2, 0)]);
824 load_vec(pg, u56i, &ct[
VLEN * index_clv_up(1, 1, 1, 2, 1)]);
826 axpy_vec(pg, wt1r, u12r, vt2r);
827 axpy_vec(pg, wt1i, u12r, vt2i);
828 ymax_vec(pg, wt1r, u12i, vt2i);
829 axpy_vec(pg, wt1i, u12i, vt2r);
830 axpy_vec(pg, wt2r, u12r, vt1r);
831 axpy_vec(pg, wt2i, u12r, vt1i);
832 axpy_vec(pg, wt2r, u12i, vt1i);
833 ymax_vec(pg, wt2i, u12i, vt1r);
836 load_vec(pg, u23r, &ct[
VLEN * index_clv_up(0, 1, 0, 2, 0)]);
837 load_vec(pg, u23i, &ct[
VLEN * index_clv_up(0, 1, 0, 2, 1)]);
838 axpy_vec(pg, wt3r, u34r, vt4r);
839 axpy_vec(pg, wt3i, u34r, vt4i);
840 ymax_vec(pg, wt3r, u34i, vt4i);
841 axpy_vec(pg, wt3i, u34i, vt4r);
842 axpy_vec(pg, wt4r, u34r, vt3r);
843 axpy_vec(pg, wt4i, u34r, vt3i);
844 axpy_vec(pg, wt4r, u34i, vt3i);
845 ymax_vec(pg, wt4i, u34i, vt3r);
848 load_vec(pg, u45r, &ct[
VLEN * index_clv_up(1, 0, 1, 1, 0)]);
849 load_vec(pg, u45i, &ct[
VLEN * index_clv_up(1, 0, 1, 1, 1)]);
850 axpy_vec(pg, wt5r, u56r, vt6r);
851 axpy_vec(pg, wt5i, u56r, vt6i);
852 ymax_vec(pg, wt5r, u56i, vt6i);
853 axpy_vec(pg, wt5i, u56i, vt6r);
854 axpy_vec(pg, wt6r, u56r, vt5r);
855 axpy_vec(pg, wt6i, u56r, vt5i);
856 axpy_vec(pg, wt6r, u56i, vt5i);
857 ymax_vec(pg, wt6i, u56i, vt5r);
860 load_vec(pg, u13r, &ct[
VLEN * index_clv_up(0, 0, 0, 2, 0)]);
861 load_vec(pg, u13i, &ct[
VLEN * index_clv_up(0, 0, 0, 2, 1)]);
862 axpy_vec(pg, wt2r, u23r, vt3r);
863 axpy_vec(pg, wt2i, u23r, vt3i);
864 ymax_vec(pg, wt2r, u23i, vt3i);
865 axpy_vec(pg, wt2i, u23i, vt3r);
866 axpy_vec(pg, wt3r, u23r, vt2r);
867 axpy_vec(pg, wt3i, u23r, vt2i);
868 axpy_vec(pg, wt3r, u23i, vt2i);
869 ymax_vec(pg, wt3i, u23i, vt2r);
872 load_vec(pg, u24r, &ct[
VLEN * index_clv_up(0, 1, 1, 0, 0)]);
873 load_vec(pg, u24i, &ct[
VLEN * index_clv_up(0, 1, 1, 0, 1)]);
874 axpy_vec(pg, wt4r, u45r, vt5r);
875 axpy_vec(pg, wt4i, u45r, vt5i);
876 ymax_vec(pg, wt4r, u45i, vt5i);
877 axpy_vec(pg, wt4i, u45i, vt5r);
878 axpy_vec(pg, wt5r, u45r, vt4r);
879 axpy_vec(pg, wt5i, u45r, vt4i);
880 axpy_vec(pg, wt5r, u45i, vt4i);
881 ymax_vec(pg, wt5i, u45i, vt4r);
884 load_vec(pg, u35r, &ct[
VLEN * index_clv_up(0, 2, 1, 1, 0)]);
885 load_vec(pg, u35i, &ct[
VLEN * index_clv_up(0, 2, 1, 1, 1)]);
886 axpy_vec(pg, wt1r, u13r, vt3r);
887 axpy_vec(pg, wt1i, u13r, vt3i);
888 ymax_vec(pg, wt1r, u13i, vt3i);
889 axpy_vec(pg, wt1i, u13i, vt3r);
890 axpy_vec(pg, wt3r, u13r, vt1r);
891 axpy_vec(pg, wt3i, u13r, vt1i);
892 axpy_vec(pg, wt3r, u13i, vt1i);
893 ymax_vec(pg, wt3i, u13i, vt1r);
896 load_vec(pg, u46r, &ct[
VLEN * index_clv_up(1, 0, 1, 2, 0)]);
897 load_vec(pg, u46i, &ct[
VLEN * index_clv_up(1, 0, 1, 2, 1)]);
898 axpy_vec(pg, wt2r, u24r, vt4r);
899 axpy_vec(pg, wt2i, u24r, vt4i);
900 ymax_vec(pg, wt2r, u24i, vt4i);
901 axpy_vec(pg, wt2i, u24i, vt4r);
902 axpy_vec(pg, wt4r, u24r, vt2r);
903 axpy_vec(pg, wt4i, u24r, vt2i);
904 axpy_vec(pg, wt4r, u24i, vt2i);
905 ymax_vec(pg, wt4i, u24i, vt2r);
908 load_vec(pg, u15r, &ct[
VLEN * index_clv_up(0, 0, 1, 1, 0)]);
909 load_vec(pg, u15i, &ct[
VLEN * index_clv_up(0, 0, 1, 1, 1)]);
910 axpy_vec(pg, wt3r, u35r, vt5r);
911 axpy_vec(pg, wt3i, u35r, vt5i);
912 ymax_vec(pg, wt3r, u35i, vt5i);
913 axpy_vec(pg, wt3i, u35i, vt5r);
914 axpy_vec(pg, wt5r, u35r, vt3r);
915 axpy_vec(pg, wt5i, u35r, vt3i);
916 axpy_vec(pg, wt5r, u35i, vt3i);
917 ymax_vec(pg, wt5i, u35i, vt3r);
921 load_vec(pg, u26r, &ct[
VLEN * index_clv_up(0, 1, 1, 2, 0)]);
922 load_vec(pg, u26i, &ct[
VLEN * index_clv_up(0, 1, 1, 2, 1)]);
923 axpy_vec(pg, wt6r, u46r, vt4r);
924 axpy_vec(pg, wt6i, u46r, vt4i);
925 axpy_vec(pg, wt6r, u46i, vt4i);
926 ymax_vec(pg, wt6i, u46i, vt4r);
927 axpy_vec(pg, wt4r, u46r, vt6r);
928 axpy_vec(pg, wt4i, u46r, vt6i);
929 ymax_vec(pg, wt4r, u46i, vt6i);
930 axpy_vec(pg, wt4i, u46i, vt6r);
933 load_vec(pg, u14r, &ct[
VLEN * index_clv_up(0, 0, 1, 0, 0)]);
934 load_vec(pg, u14i, &ct[
VLEN * index_clv_up(0, 0, 1, 0, 1)]);
935 axpy_vec(pg, wt1r, u15r, vt5r);
936 axpy_vec(pg, wt1i, u15r, vt5i);
937 ymax_vec(pg, wt1r, u15i, vt5i);
938 axpy_vec(pg, wt1i, u15i, vt5r);
939 axpy_vec(pg, wt5r, u15r, vt1r);
940 axpy_vec(pg, wt5i, u15r, vt1i);
941 axpy_vec(pg, wt5r, u15i, vt1i);
942 ymax_vec(pg, wt5i, u15i, vt1r);
945 load_vec(pg, u36r, &ct[
VLEN * index_clv_up(0, 2, 1, 2, 0)]);
946 load_vec(pg, u36i, &ct[
VLEN * index_clv_up(0, 2, 1, 2, 1)]);
947 axpy_vec(pg, wt6r, u26r, vt2r);
948 axpy_vec(pg, wt6i, u26r, vt2i);
949 axpy_vec(pg, wt6r, u26i, vt2i);
950 ymax_vec(pg, wt6i, u26i, vt2r);
951 axpy_vec(pg, wt2r, u26r, vt6r);
952 axpy_vec(pg, wt2i, u26r, vt6i);
953 ymax_vec(pg, wt2r, u26i, vt6i);
954 axpy_vec(pg, wt2i, u26i, vt6r);
957 load_vec(pg, u25r, &ct[
VLEN * index_clv_up(0, 1, 1, 1, 0)]);
958 load_vec(pg, u25i, &ct[
VLEN * index_clv_up(0, 1, 1, 1, 1)]);
959 axpy_vec(pg, wt4r, u14r, vt1r);
960 axpy_vec(pg, wt4i, u14r, vt1i);
961 axpy_vec(pg, wt4r, u14i, vt1i);
962 ymax_vec(pg, wt4i, u14i, vt1r);
963 axpy_vec(pg, wt1r, u14r, vt4r);
964 axpy_vec(pg, wt1i, u14r, vt4i);
965 ymax_vec(pg, wt1r, u14i, vt4i);
966 axpy_vec(pg, wt1i, u14i, vt4r);
969 load_vec(pg, u16r, &ct[
VLEN * index_clv_up(0, 0, 1, 2, 0)]);
970 load_vec(pg, u16i, &ct[
VLEN * index_clv_up(0, 0, 1, 2, 1)]);
971 axpy_vec(pg, wt6r, u36r, vt3r);
972 axpy_vec(pg, wt6i, u36r, vt3i);
973 axpy_vec(pg, wt6r, u36i, vt3i);
974 ymax_vec(pg, wt6i, u36i, vt3r);
975 axpy_vec(pg, wt3r, u36r, vt6r);
976 axpy_vec(pg, wt3i, u36r, vt6i);
977 ymax_vec(pg, wt3r, u36i, vt6i);
978 axpy_vec(pg, wt3i, u36i, vt6r);
981 axpy_vec(pg, wt2r, u25r, vt5r);
982 axpy_vec(pg, wt2i, u25r, vt5i);
983 ymax_vec(pg, wt2r, u25i, vt5i);
984 axpy_vec(pg, wt2i, u25i, vt5r);
985 axpy_vec(pg, wt5r, u25r, vt2r);
986 axpy_vec(pg, wt5i, u25r, vt2i);
987 axpy_vec(pg, wt5r, u25i, vt2i);
988 ymax_vec(pg, wt5i, u25i, vt2r);
990 axpy_vec(pg, wt1r, u16r, vt6r);
991 axpy_vec(pg, wt1i, u16r, vt6i);
992 ymax_vec(pg, wt1r, u16i, vt6i);
993 axpy_vec(pg, wt1i, u16i, vt6r);
994 axpy_vec(pg, wt6r, u16r, vt1r);
995 axpy_vec(pg, wt6i, u16r, vt1i);
996 axpy_vec(pg, wt6r, u16i, vt1i);
997 ymax_vec(pg, wt6i, u16i, vt1r);
999 save_vec(pg, &v2[
VLEN * (2 * (2 + 4 * 0))], wt1r);
1000 save_vec(pg, &v2[
VLEN * (1 + 2 * (2 + 4 * 0))], wt1i);
1001 save_vec(pg, &v2[
VLEN * (2 * (2 + 4 * 1))], wt2r);
1002 save_vec(pg, &v2[
VLEN * (1 + 2 * (2 + 4 * 1))], wt2i);
1003 save_vec(pg, &v2[
VLEN * (2 * (2 + 4 * 2))], wt3r);
1004 save_vec(pg, &v2[
VLEN * (1 + 2 * (2 + 4 * 2))], wt3i);
1006 save_vec(pg, &v2[
VLEN * (2 * (3 + 4 * 0))], wt4r);
1007 save_vec(pg, &v2[
VLEN * (1 + 2 * (3 + 4 * 0))], wt4i);
1008 save_vec(pg, &v2[
VLEN * (2 * (3 + 4 * 1))], wt5r);
1009 save_vec(pg, &v2[
VLEN * (1 + 2 * (3 + 4 * 1))], wt5i);
1010 save_vec(pg, &v2[
VLEN * (2 * (3 + 4 * 2))], wt6r);
1011 save_vec(pg, &v2[
VLEN * (1 + 2 * (3 + 4 * 2))], wt6i);
1013 flip_sign(pg, wt1r);
1014 flip_sign(pg, wt1i);
1015 flip_sign(pg, wt2r);
1016 flip_sign(pg, wt2i);
1017 flip_sign(pg, wt3r);
1018 flip_sign(pg, wt3i);
1019 flip_sign(pg, wt4r);
1020 flip_sign(pg, wt4i);
1021 flip_sign(pg, wt5r);
1022 flip_sign(pg, wt5i);
1023 flip_sign(pg, wt6r);
1024 flip_sign(pg, wt6i);
1026 save_vec(pg, &v2[
VLEN * (2 * (0 + 4 * 0))], wt1r);
1027 save_vec(pg, &v2[
VLEN * (1 + 2 * (0 + 4 * 0))], wt1i);
1028 save_vec(pg, &v2[
VLEN * (2 * (0 + 4 * 1))], wt2r);
1029 save_vec(pg, &v2[
VLEN * (1 + 2 * (0 + 4 * 1))], wt2i);
1030 save_vec(pg, &v2[
VLEN * (2 * (0 + 4 * 2))], wt3r);
1031 save_vec(pg, &v2[
VLEN * (1 + 2 * (0 + 4 * 2))], wt3i);
1033 save_vec(pg, &v2[
VLEN * (2 * (1 + 4 * 0))], wt4r);
1034 save_vec(pg, &v2[
VLEN * (1 + 2 * (1 + 4 * 0))], wt4i);
1035 save_vec(pg, &v2[
VLEN * (2 * (1 + 4 * 1))], wt5r);
1036 save_vec(pg, &v2[
VLEN * (1 + 2 * (1 + 4 * 1))], wt5i);
1037 save_vec(pg, &v2[
VLEN * (2 * (1 + 4 * 2))], wt6r);
1038 save_vec(pg, &v2[
VLEN * (1 + 2 * (1 + 4 * 2))], wt6i);
1043 set_2sp_dn(pg, vt1r, vt1i, v1, 0, 0);
1044 set_2sp_dn(pg, vt2r, vt2i, v1, 0, 1);
1045 set_2sp_dn(pg, vt3r, vt3i, v1, 0, 2);
1046 set_2sp_dn(pg, vt4r, vt4i, v1, 1, 0);
1047 set_2sp_dn(pg, vt5r, vt5i, v1, 1, 1);
1048 set_2sp_dn(pg, vt6r, vt6i, v1, 1, 2);
1050 svreal_t u11r, u22r, u33r, u44r, u55r, u66r;
1051 load_vec(pg, u11r, &ct[
VLEN * index_clv_dn(0, 0, 0, 0, 0)]);
1052 load_vec(pg, u22r, &ct[
VLEN * index_clv_dn(0, 1, 0, 1, 0)]);
1053 load_vec(pg, u33r, &ct[
VLEN * index_clv_dn(0, 2, 0, 2, 0)]);
1054 load_vec(pg, u44r, &ct[
VLEN * index_clv_dn(1, 0, 1, 0, 0)]);
1055 load_vec(pg, u55r, &ct[
VLEN * index_clv_dn(1, 1, 1, 1, 0)]);
1056 load_vec(pg, u66r, &ct[
VLEN * index_clv_dn(1, 2, 1, 2, 0)]);
1058 svreal_t wt1r, wt1i, wt2r, wt2i, wt3r, wt3i;
1059 svreal_t wt4r, wt4i, wt5r, wt5i, wt6r, wt6i;
1060 mul_vec(pg, wt1r, u11r, vt1r);
1061 mul_vec(pg, wt1i, u11r, vt1i);
1062 mul_vec(pg, wt2r, u22r, vt2r);
1063 mul_vec(pg, wt2i, u22r, vt2i);
1064 mul_vec(pg, wt3r, u33r, vt3r);
1065 mul_vec(pg, wt3i, u33r, vt3i);
1068 load_vec(pg, u12r, &ct[
VLEN * index_clv_dn(0, 0, 0, 1, 0)]);
1069 load_vec(pg, u12i, &ct[
VLEN * index_clv_dn(0, 0, 0, 1, 1)]);
1071 mul_vec(pg, wt4r, u44r, vt4r);
1072 mul_vec(pg, wt4i, u44r, vt4i);
1073 mul_vec(pg, wt5r, u55r, vt5r);
1074 mul_vec(pg, wt5i, u55r, vt5i);
1075 mul_vec(pg, wt6r, u66r, vt6r);
1076 mul_vec(pg, wt6i, u66r, vt6i);
1079 load_vec(pg, u34r, &ct[
VLEN * index_clv_dn(0, 2, 1, 0, 0)]);
1080 load_vec(pg, u34i, &ct[
VLEN * index_clv_dn(0, 2, 1, 0, 1)]);
1081 load_vec(pg, u56r, &ct[
VLEN * index_clv_dn(1, 1, 1, 2, 0)]);
1082 load_vec(pg, u56i, &ct[
VLEN * index_clv_dn(1, 1, 1, 2, 1)]);
1084 axpy_vec(pg, wt1r, u12r, vt2r);
1085 axpy_vec(pg, wt1i, u12r, vt2i);
1086 ymax_vec(pg, wt1r, u12i, vt2i);
1087 axpy_vec(pg, wt1i, u12i, vt2r);
1088 axpy_vec(pg, wt2r, u12r, vt1r);
1089 axpy_vec(pg, wt2i, u12r, vt1i);
1090 axpy_vec(pg, wt2r, u12i, vt1i);
1091 ymax_vec(pg, wt2i, u12i, vt1r);
1094 load_vec(pg, u23r, &ct[
VLEN * index_clv_dn(0, 1, 0, 2, 0)]);
1095 load_vec(pg, u23i, &ct[
VLEN * index_clv_dn(0, 1, 0, 2, 1)]);
1096 axpy_vec(pg, wt3r, u34r, vt4r);
1097 axpy_vec(pg, wt3i, u34r, vt4i);
1098 ymax_vec(pg, wt3r, u34i, vt4i);
1099 axpy_vec(pg, wt3i, u34i, vt4r);
1100 axpy_vec(pg, wt4r, u34r, vt3r);
1101 axpy_vec(pg, wt4i, u34r, vt3i);
1102 axpy_vec(pg, wt4r, u34i, vt3i);
1103 ymax_vec(pg, wt4i, u34i, vt3r);
1106 load_vec(pg, u45r, &ct[
VLEN * index_clv_dn(1, 0, 1, 1, 0)]);
1107 load_vec(pg, u45i, &ct[
VLEN * index_clv_dn(1, 0, 1, 1, 1)]);
1108 axpy_vec(pg, wt5r, u56r, vt6r);
1109 axpy_vec(pg, wt5i, u56r, vt6i);
1110 ymax_vec(pg, wt5r, u56i, vt6i);
1111 axpy_vec(pg, wt5i, u56i, vt6r);
1112 axpy_vec(pg, wt6r, u56r, vt5r);
1113 axpy_vec(pg, wt6i, u56r, vt5i);
1114 axpy_vec(pg, wt6r, u56i, vt5i);
1115 ymax_vec(pg, wt6i, u56i, vt5r);
1118 load_vec(pg, u13r, &ct[
VLEN * index_clv_dn(0, 0, 0, 2, 0)]);
1119 load_vec(pg, u13i, &ct[
VLEN * index_clv_dn(0, 0, 0, 2, 1)]);
1120 axpy_vec(pg, wt2r, u23r, vt3r);
1121 axpy_vec(pg, wt2i, u23r, vt3i);
1122 ymax_vec(pg, wt2r, u23i, vt3i);
1123 axpy_vec(pg, wt2i, u23i, vt3r);
1124 axpy_vec(pg, wt3r, u23r, vt2r);
1125 axpy_vec(pg, wt3i, u23r, vt2i);
1126 axpy_vec(pg, wt3r, u23i, vt2i);
1127 ymax_vec(pg, wt3i, u23i, vt2r);
1130 load_vec(pg, u24r, &ct[
VLEN * index_clv_dn(0, 1, 1, 0, 0)]);
1131 load_vec(pg, u24i, &ct[
VLEN * index_clv_dn(0, 1, 1, 0, 1)]);
1132 axpy_vec(pg, wt4r, u45r, vt5r);
1133 axpy_vec(pg, wt4i, u45r, vt5i);
1134 ymax_vec(pg, wt4r, u45i, vt5i);
1135 axpy_vec(pg, wt4i, u45i, vt5r);
1136 axpy_vec(pg, wt5r, u45r, vt4r);
1137 axpy_vec(pg, wt5i, u45r, vt4i);
1138 axpy_vec(pg, wt5r, u45i, vt4i);
1139 ymax_vec(pg, wt5i, u45i, vt4r);
1142 load_vec(pg, u35r, &ct[
VLEN * index_clv_dn(0, 2, 1, 1, 0)]);
1143 load_vec(pg, u35i, &ct[
VLEN * index_clv_dn(0, 2, 1, 1, 1)]);
1144 axpy_vec(pg, wt1r, u13r, vt3r);
1145 axpy_vec(pg, wt1i, u13r, vt3i);
1146 ymax_vec(pg, wt1r, u13i, vt3i);
1147 axpy_vec(pg, wt1i, u13i, vt3r);
1148 axpy_vec(pg, wt3r, u13r, vt1r);
1149 axpy_vec(pg, wt3i, u13r, vt1i);
1150 axpy_vec(pg, wt3r, u13i, vt1i);
1151 ymax_vec(pg, wt3i, u13i, vt1r);
1154 load_vec(pg, u46r, &ct[
VLEN * index_clv_dn(1, 0, 1, 2, 0)]);
1155 load_vec(pg, u46i, &ct[
VLEN * index_clv_dn(1, 0, 1, 2, 1)]);
1156 axpy_vec(pg, wt2r, u24r, vt4r);
1157 axpy_vec(pg, wt2i, u24r, vt4i);
1158 ymax_vec(pg, wt2r, u24i, vt4i);
1159 axpy_vec(pg, wt2i, u24i, vt4r);
1160 axpy_vec(pg, wt4r, u24r, vt2r);
1161 axpy_vec(pg, wt4i, u24r, vt2i);
1162 axpy_vec(pg, wt4r, u24i, vt2i);
1163 ymax_vec(pg, wt4i, u24i, vt2r);
1166 load_vec(pg, u15r, &ct[
VLEN * index_clv_dn(0, 0, 1, 1, 0)]);
1167 load_vec(pg, u15i, &ct[
VLEN * index_clv_dn(0, 0, 1, 1, 1)]);
1168 axpy_vec(pg, wt3r, u35r, vt5r);
1169 axpy_vec(pg, wt3i, u35r, vt5i);
1170 ymax_vec(pg, wt3r, u35i, vt5i);
1171 axpy_vec(pg, wt3i, u35i, vt5r);
1172 axpy_vec(pg, wt5r, u35r, vt3r);
1173 axpy_vec(pg, wt5i, u35r, vt3i);
1174 axpy_vec(pg, wt5r, u35i, vt3i);
1175 ymax_vec(pg, wt5i, u35i, vt3r);
1178 load_vec(pg, u26r, &ct[
VLEN * index_clv_dn(0, 1, 1, 2, 0)]);
1179 load_vec(pg, u26i, &ct[
VLEN * index_clv_dn(0, 1, 1, 2, 1)]);
1180 axpy_vec(pg, wt6r, u46r, vt4r);
1181 axpy_vec(pg, wt6i, u46r, vt4i);
1182 axpy_vec(pg, wt6r, u46i, vt4i);
1183 ymax_vec(pg, wt6i, u46i, vt4r);
1184 axpy_vec(pg, wt4r, u46r, vt6r);
1185 axpy_vec(pg, wt4i, u46r, vt6i);
1186 ymax_vec(pg, wt4r, u46i, vt6i);
1187 axpy_vec(pg, wt4i, u46i, vt6r);
1190 load_vec(pg, u14r, &ct[
VLEN * index_clv_dn(0, 0, 1, 0, 0)]);
1191 load_vec(pg, u14i, &ct[
VLEN * index_clv_dn(0, 0, 1, 0, 1)]);
1192 axpy_vec(pg, wt1r, u15r, vt5r);
1193 axpy_vec(pg, wt1i, u15r, vt5i);
1194 ymax_vec(pg, wt1r, u15i, vt5i);
1195 axpy_vec(pg, wt1i, u15i, vt5r);
1196 axpy_vec(pg, wt5r, u15r, vt1r);
1197 axpy_vec(pg, wt5i, u15r, vt1i);
1198 axpy_vec(pg, wt5r, u15i, vt1i);
1199 ymax_vec(pg, wt5i, u15i, vt1r);
1202 load_vec(pg, u36r, &ct[
VLEN * index_clv_dn(0, 2, 1, 2, 0)]);
1203 load_vec(pg, u36i, &ct[
VLEN * index_clv_dn(0, 2, 1, 2, 1)]);
1204 axpy_vec(pg, wt6r, u26r, vt2r);
1205 axpy_vec(pg, wt6i, u26r, vt2i);
1206 axpy_vec(pg, wt6r, u26i, vt2i);
1207 ymax_vec(pg, wt6i, u26i, vt2r);
1208 axpy_vec(pg, wt2r, u26r, vt6r);
1209 axpy_vec(pg, wt2i, u26r, vt6i);
1210 ymax_vec(pg, wt2r, u26i, vt6i);
1211 axpy_vec(pg, wt2i, u26i, vt6r);
1214 load_vec(pg, u25r, &ct[
VLEN * index_clv_dn(0, 1, 1, 1, 0)]);
1215 load_vec(pg, u25i, &ct[
VLEN * index_clv_dn(0, 1, 1, 1, 1)]);
1216 axpy_vec(pg, wt4r, u14r, vt1r);
1217 axpy_vec(pg, wt4i, u14r, vt1i);
1218 axpy_vec(pg, wt4r, u14i, vt1i);
1219 ymax_vec(pg, wt4i, u14i, vt1r);
1220 axpy_vec(pg, wt1r, u14r, vt4r);
1221 axpy_vec(pg, wt1i, u14r, vt4i);
1222 ymax_vec(pg, wt1r, u14i, vt4i);
1223 axpy_vec(pg, wt1i, u14i, vt4r);
1226 load_vec(pg, u16r, &ct[
VLEN * index_clv_dn(0, 0, 1, 2, 0)]);
1227 load_vec(pg, u16i, &ct[
VLEN * index_clv_dn(0, 0, 1, 2, 1)]);
1228 axpy_vec(pg, wt6r, u36r, vt3r);
1229 axpy_vec(pg, wt6i, u36r, vt3i);
1230 axpy_vec(pg, wt6r, u36i, vt3i);
1231 ymax_vec(pg, wt6i, u36i, vt3r);
1232 axpy_vec(pg, wt3r, u36r, vt6r);
1233 axpy_vec(pg, wt3i, u36r, vt6i);
1234 ymax_vec(pg, wt3r, u36i, vt6i);
1235 axpy_vec(pg, wt3i, u36i, vt6r);
1238 axpy_vec(pg, wt2r, u25r, vt5r);
1239 axpy_vec(pg, wt2i, u25r, vt5i);
1240 ymax_vec(pg, wt2r, u25i, vt5i);
1241 axpy_vec(pg, wt2i, u25i, vt5r);
1242 axpy_vec(pg, wt5r, u25r, vt2r);
1243 axpy_vec(pg, wt5i, u25r, vt2i);
1244 axpy_vec(pg, wt5r, u25i, vt2i);
1245 ymax_vec(pg, wt5i, u25i, vt2r);
1247 axpy_vec(pg, wt1r, u16r, vt6r);
1248 axpy_vec(pg, wt1i, u16r, vt6i);
1249 ymax_vec(pg, wt1r, u16i, vt6i);
1250 axpy_vec(pg, wt1i, u16i, vt6r);
1251 axpy_vec(pg, wt6r, u16r, vt1r);
1252 axpy_vec(pg, wt6i, u16r, vt1i);
1253 axpy_vec(pg, wt6r, u16i, vt1i);
1254 ymax_vec(pg, wt6i, u16i, vt1r);
1256 svreal_t xt1r, xt1i, xt2r, xt2i, xt3r, xt3i;
1257 svreal_t xt4r, xt4i, xt5r, xt5i, xt6r, xt6i;
1260 load_vec(pg, xt1r, &v2[
VLEN * (2 * (0 + 4 * 0))]);
1261 load_vec(pg, xt1i, &v2[
VLEN * (1 + 2 * (0 + 4 * 0))]);
1262 load_vec(pg, xt2r, &v2[
VLEN * (2 * (0 + 4 * 1))]);
1263 load_vec(pg, xt2i, &v2[
VLEN * (1 + 2 * (0 + 4 * 1))]);
1264 load_vec(pg, xt3r, &v2[
VLEN * (2 * (0 + 4 * 2))]);
1265 load_vec(pg, xt3i, &v2[
VLEN * (1 + 2 * (0 + 4 * 2))]);
1266 load_vec(pg, xt4r, &v2[
VLEN * (2 * (1 + 4 * 0))]);
1267 load_vec(pg, xt4i, &v2[
VLEN * (1 + 2 * (1 + 4 * 0))]);
1268 load_vec(pg, xt5r, &v2[
VLEN * (2 * (1 + 4 * 1))]);
1269 load_vec(pg, xt5i, &v2[
VLEN * (1 + 2 * (1 + 4 * 1))]);
1270 load_vec(pg, xt6r, &v2[
VLEN * (2 * (1 + 4 * 2))]);
1271 load_vec(pg, xt6i, &v2[
VLEN * (1 + 2 * (1 + 4 * 2))]);
1273 add_vec(pg, xt1r, wt1r);
1274 add_vec(pg, xt1i, wt1i);
1275 add_vec(pg, xt2r, wt2r);
1276 add_vec(pg, xt2i, wt2i);
1277 add_vec(pg, xt3r, wt3r);
1278 add_vec(pg, xt3i, wt3i);
1279 add_vec(pg, xt4r, wt4r);
1280 add_vec(pg, xt4i, wt4i);
1281 add_vec(pg, xt5r, wt5r);
1282 add_vec(pg, xt5i, wt5i);
1283 add_vec(pg, xt6r, wt6r);
1284 add_vec(pg, xt6i, wt6i);
1286 save_vec(pg, &v2[
VLEN * (2 * (0 + 4 * 0))], xt1r);
1287 save_vec(pg, &v2[
VLEN * (1 + 2 * (0 + 4 * 0))], xt1i);
1288 save_vec(pg, &v2[
VLEN * (2 * (0 + 4 * 1))], xt2r);
1289 save_vec(pg, &v2[
VLEN * (1 + 2 * (0 + 4 * 1))], xt2i);
1290 save_vec(pg, &v2[
VLEN * (2 * (0 + 4 * 2))], xt3r);
1291 save_vec(pg, &v2[
VLEN * (1 + 2 * (0 + 4 * 2))], xt3i);
1292 save_vec(pg, &v2[
VLEN * (2 * (1 + 4 * 0))], xt4r);
1293 save_vec(pg, &v2[
VLEN * (1 + 2 * (1 + 4 * 0))], xt4i);
1294 save_vec(pg, &v2[
VLEN * (2 * (1 + 4 * 1))], xt5r);
1295 save_vec(pg, &v2[
VLEN * (1 + 2 * (1 + 4 * 1))], xt5i);
1296 save_vec(pg, &v2[
VLEN * (2 * (1 + 4 * 2))], xt6r);
1297 save_vec(pg, &v2[
VLEN * (1 + 2 * (1 + 4 * 2))], xt6i);
1299 load_vec(pg, xt1r, &v2[
VLEN * (2 * (2 + 4 * 0))]);
1300 load_vec(pg, xt1i, &v2[
VLEN * (1 + 2 * (2 + 4 * 0))]);
1301 load_vec(pg, xt2r, &v2[
VLEN * (2 * (2 + 4 * 1))]);
1302 load_vec(pg, xt2i, &v2[
VLEN * (1 + 2 * (2 + 4 * 1))]);
1303 load_vec(pg, xt3r, &v2[
VLEN * (2 * (2 + 4 * 2))]);
1304 load_vec(pg, xt3i, &v2[
VLEN * (1 + 2 * (2 + 4 * 2))]);
1305 load_vec(pg, xt4r, &v2[
VLEN * (2 * (3 + 4 * 0))]);
1306 load_vec(pg, xt4i, &v2[
VLEN * (1 + 2 * (3 + 4 * 0))]);
1307 load_vec(pg, xt5r, &v2[
VLEN * (2 * (3 + 4 * 1))]);
1308 load_vec(pg, xt5i, &v2[
VLEN * (1 + 2 * (3 + 4 * 1))]);
1309 load_vec(pg, xt6r, &v2[
VLEN * (2 * (3 + 4 * 2))]);
1310 load_vec(pg, xt6i, &v2[
VLEN * (1 + 2 * (3 + 4 * 2))]);
1312 add_vec(pg, xt1r, wt1r);
1313 add_vec(pg, xt1i, wt1i);
1314 add_vec(pg, xt2r, wt2r);
1315 add_vec(pg, xt2i, wt2i);
1316 add_vec(pg, xt3r, wt3r);
1317 add_vec(pg, xt3i, wt3i);
1318 add_vec(pg, xt4r, wt4r);
1319 add_vec(pg, xt4i, wt4i);
1320 add_vec(pg, xt5r, wt5r);
1321 add_vec(pg, xt5i, wt5i);
1322 add_vec(pg, xt6r, wt6r);
1323 add_vec(pg, xt6i, wt6i);
1325 save_vec(pg, &v2[
VLEN * (2 * (2 + 4 * 0))], xt1r);
1326 save_vec(pg, &v2[
VLEN * (1 + 2 * (2 + 4 * 0))], xt1i);
1327 save_vec(pg, &v2[
VLEN * (2 * (2 + 4 * 1))], xt2r);
1328 save_vec(pg, &v2[
VLEN * (1 + 2 * (2 + 4 * 1))], xt2i);
1329 save_vec(pg, &v2[
VLEN * (2 * (2 + 4 * 2))], xt3r);
1330 save_vec(pg, &v2[
VLEN * (1 + 2 * (2 + 4 * 2))], xt3i);
1331 save_vec(pg, &v2[
VLEN * (2 * (3 + 4 * 0))], xt4r);
1332 save_vec(pg, &v2[
VLEN * (1 + 2 * (3 + 4 * 0))], xt4i);
1333 save_vec(pg, &v2[
VLEN * (2 * (3 + 4 * 1))], xt5r);
1334 save_vec(pg, &v2[
VLEN * (1 + 2 * (3 + 4 * 1))], xt5i);
1335 save_vec(pg, &v2[
VLEN * (2 * (3 + 4 * 2))], xt6r);
1336 save_vec(pg, &v2[
VLEN * (1 + 2 * (3 + 4 * 2))], xt6i);