#ifndef QXS_VSIMD_INCLUDED
#define QXS_VSIMD_INCLUDED
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
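// Inline wrappers around Arm SVE ACLE intrinsics for 32-bit lanes:
// predicate construction first, then index/predicate builders for the
// x/y directions, then predicated load, set, and sign-flip helpers.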
typedef int32_t int_t;
// assumed body: returns an all-false predicate
inline svbool_t set_predicate_false() { return svpfalse_b(); }
inline svbool_t set_predicate_whilelt(int range)
{
  return svwhilelt_b32(0, range);
}
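// The predicate-building loops below exploit that sveor_z of two
// svwhilelt_b32 masks, governed by an all-true pg0 (set up in code not
// shown here), activates exactly the lanes between the two bounds.
// This pass marks the last lane of each VLENXS-wide row, assuming pg2
// starts out all-false; pg1 then becomes its complement.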
for (int iy = VLENYS; iy > 0; --iy) {
  pg1 = svwhilelt_b32(0, iy * VLENXS);
  pg2 = sveor_z(pg0, pg2, pg1);
  pg1 = svwhilelt_b32(0, iy * VLENXS - 1);
  pg2 = sveor_z(pg0, pg2, pg1);
}
pg1 = svnot_z(pg0, pg2);
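// Same technique, presumably for the opposite x direction: each pass
// marks every lane of a row except its first element.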
for (int iy = VLENYS - 1; iy > 0; --iy) {
  pg1 = svwhilelt_b32(0, iy * VLENXS);
  pg2 = sveor_z(pg0, pg2, pg1);
  pg1 = svwhilelt_b32(0, (iy - 1) * VLENXS + 1);
  pg2 = sveor_z(pg0, pg2, pg1);
}
pg1 = svnot_z(pg0, pg2);
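// Even/odd variant: two accumulators are built, pg2 over last-lane-of-row
// slices and pg3 over whole rows; the per-iteration choice between them
// sits in lines not shown here (marked "// ...") and presumably depends
// on the even/odd site argument.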
for (int iy = VLENYS; iy > 0; --iy) {
  // ...
  pg1 = svwhilelt_b32(0, iy * VLENXS);
  pg2 = sveor_z(pg0, pg2, pg1);
  pg1 = svwhilelt_b32(0, iy * VLENXS - 1);
  pg2 = sveor_z(pg0, pg2, pg1);
  // ...
  pg1 = svwhilelt_b32(0, iy * VLENXS);
  pg3 = sveor_z(pg0, pg3, pg1);
  pg1 = svwhilelt_b32(0, (iy - 1) * VLENXS);
  pg3 = sveor_z(pg0, pg3, pg1);
  // ...
}
pg1 = sveor_z(pg0, pg2, pg3);
pg1 = svnot_z(pg0, pg1);
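// Mirror of the previous loop: pg3 again collects whole rows, pg1 collects
// rows minus their first lane, and the result pg2 is the complement of
// their XOR.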
for (int iy = VLENYS; iy > 0; --iy) {
  // ...
  pg2 = svwhilelt_b32(0, iy * VLENXS);
  pg3 = sveor_z(pg0, pg3, pg2);
  pg2 = svwhilelt_b32(0, (iy - 1) * VLENXS);
  pg3 = sveor_z(pg0, pg3, pg2);
  // ...
  pg2 = svwhilelt_b32(0, iy * VLENXS);
  pg1 = sveor_z(pg0, pg1, pg2);
  pg2 = svwhilelt_b32(0, (iy - 1) * VLENXS + 1);
  pg1 = sveor_z(pg0, pg1, pg2);
  // ...
}
pg2 = sveor_z(pg0, pg1, pg3);
pg2 = svnot_z(pg0, pg2);
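// The y-direction masks are simpler: a single svwhilelt_b32 block of whole
// rows (multiples of VLENXS lanes) and its svnot_z complement.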
pg2 = svnot_z(pg0, pg1);
pg2 = svwhilelt_b32(0, VLENXS);
pg1 = svnot_z(pg0, pg2);
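// set1_at toggles exactly one lane of pg: whilelt(0, i) XOR whilelt(0, i + 1)
// covers lane i only.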
inline void set1_at(const int i, svbool_t& pg)
{
  svbool_t pg0 = svptrue_b32();        // assumed: all-true governing predicate
  svbool_t pg1 = svwhilelt_b32(0, i);  // assumed: lanes [0, i)
  svbool_t pg2 = svwhilelt_b32(0, i + 1);
  pg = sveor_z(pg0, pg, pg1);
  pg = sveor_z(pg0, pg, pg2);
}
inline void rot1_R(uint32_t *u, const int len = VLENXS)
{
  // rotate u[0..len-1] right by one place
  uint32_t tmp = u[len - 1];
  for (int i = len - 1; i >= 1; --i) {
    u[i] = u[i - 1];
  }
  u[0] = tmp;
}
inline void rot1_L(uint32_t *u, const int len = VLENXS)
{
  // rotate u[0..len-1] left by one place
  uint32_t tmp = u[0];
  for (int i = 0; i < len - 1; ++i) {
    u[i] = u[i + 1];
  }
  u[len - 1] = tmp;
}
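// Index/predicate builders for the xp/xm/yp/ym directions. Each one fills a
// small scratch array lane by lane (assumed to be "uint32_t u[VLEN]", declared
// in lines not shown here) and loads it into an SVE register with svld1_u32;
// "// ..." marks per-lane logic this excerpt omits.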
inline void set_idx_predicate_xp_eo(svbool_t& pg, svuint32_t& idx,
                                    const int ieo)  // idx/ieo parameters assumed
{
  // ...
  for (int i = 0; i < VLEN; i++) {
    // ...
  }
  // ...
  idx = svld1_u32(svptrue_b32(), u);
}
inline void set_idx_predicate_xm_eo(svbool_t& pg, svuint32_t& idx,
                                    const int ieo)  // idx/ieo parameters assumed
{
  // ...
  for (int i = 0; i < VLEN; ++i) {
    // ...
    set1_at(i + VLENXS - 1, pg);
    // ...
    set1_at(i + VLENXS - 1, pg);
    // ...
  }
  idx = svld1_u32(svptrue_b32(), u);
}
inline void set_idx_predicate_yp(svbool_t& pg1, svuint32_t& idx)
{
  pg1 = svwhilelt_b32(0, VLENXS);
  // ...
  for (int i = 0; i < VLENXS; ++i) {
    // ...
  }
  idx = svld1_u32(svptrue_b32(), u);
}
inline void set_idx_predicate_ym(svbool_t& pg1, svuint32_t& idx)
{
  // ...
  pg1 = svnot_z(svptrue_b32(), pg2);
  // ...
  for (int i = 0; i < VLENXS; ++i) {
    // ...
  }
  idx = svld1_u32(svptrue_b32(), u);
}
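// Predicated element-wise helpers: each takes the governing predicate pg
// explicitly; the merge forms keep the inactive lanes of the destination.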
inline void set_vec(svbool_t pg, svfloat32_t& v, float32_t a)
{
  v = svdup_f32_m(v, pg, a);
}
inline void clear_vec(svbool_t pg, svfloat32_t& v)
{
  v = svdup_f32_m(v, pg, 0.0f);
}
inline void load_vec(svbool_t pg, svfloat32_t& v, const float32_t *vp)
{
  v = svld1_f32(pg, vp);
}
inline void load_add(svbool_t pg, svfloat32_t& v, float32_t *vp)
{
  svfloat32_t v2;
  v2 = svld1_f32(pg, vp);
  // active lanes take the freshly loaded values, inactive lanes keep v
  v = svsel_f32(pg, v2, v);
}
inline void load_svint(svbool_t pg, svint32_t& v, int32_t *vp)
{
  v = svld1_s32(pg, vp);
}
inline void load_svuint(svbool_t pg, svuint32_t& v, uint32_t *vp)
{
  v = svld1_u32(pg, vp);
}
inline void load_vec_gather(svbool_t pg, svfloat32_t& v, float32_t *vp,
                            svint32_t index)
{
  v = svld1_gather_s32index_f32(pg, vp, index);
}
inline void load_add_gather(svbool_t pg, svfloat32_t& v, float32_t *vp,
                            svint32_t index)
{
  svfloat32_t v2;
  v2 = svld1_gather_s32index_f32(pg, vp, index);
  v = svsel_f32(pg, v2, v);
}
inline void flip_sign(svbool_t pg, svfloat32_t& v)
{
  v = svneg_f32_m(v, pg, v);
}
inline void flip_sign(svbool_t pg, svfloat32_t& v1, svfloat32_t& v2)
{
  v1 = svneg_f32_m(v2, pg, v2);
}
#endif // __ARM_FEATURE_SVE