arm tcg cpus: Fix Lesser GPL version number
[qemu.git] / target / arm / sve_helper.c
1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29
30
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that needs a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
39 #else
40 #define H1(x) (x)
41 #define H1_2(x) (x)
42 #define H1_4(x) (x)
43 #define H2(x) (x)
44 #define H4(x) (x)
45 #endif
46
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
48 *
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
52 */
53
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
56
57 /* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
59 */
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
61 {
62 if (likely(g)) {
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
65 if (!(flags & 4)) {
66 flags |= ((d & (g & -g)) != 0) << 31;
67 flags |= 4;
68 }
69
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
72
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
75 }
76 return flags;
77 }
78
79 /* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
81 */
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
83 {
84 if (likely(g)) {
85 /* Compute C from first (i.e last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
87 if (!(flags & 4)) {
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
90 }
91
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
94
95 /* Compute N from last (i.e first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
97 }
98 return flags;
99 }
100
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
103 {
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 }
106
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
109 {
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
112 uintptr_t i = 0;
113
114 do {
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
117
118 return flags;
119 }
120
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
127 * }
128 * }
129 * printf("0x%016lx,\n", m);
130 * }
131 */
132 static inline uint64_t expand_pred_b(uint8_t byte)
133 {
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
220 0xffffffffffffffff,
221 };
222 return word[byte];
223 }
224
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
228 * if (i & 0xaa) {
229 * continue;
230 * }
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
234 * }
235 * }
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 * }
238 */
239 static inline uint64_t expand_pred_h(uint8_t byte)
240 {
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
250 };
251 return word[byte & 0x55];
252 }
253
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
256 {
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
261 };
262 return word[byte & 0x11];
263 }
264
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
267 {
268 return rol32(h, 16);
269 }
270
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
273 {
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
277 }
278
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
281 {
282 return rol64(h, 32);
283 }
284
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
287 { \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
290 uintptr_t i; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
293 } \
294 }
295
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
304
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
313
314 #undef DO_AND
315 #undef DO_BIC
316 #undef DO_EOR
317 #undef DO_ORR
318 #undef DO_ORN
319 #undef DO_NOR
320 #undef DO_NAND
321 #undef DO_SEL
322 #undef LOGICAL_PPPP
323
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
326 */
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
331 */
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
334 { \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 do { \
339 if (pg & 1) { \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
343 } \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
345 } while (i & 15); \
346 } \
347 }
348
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
352 { \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
355 uint8_t *pg = vg; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
359 d[i] = OP(nn, mm); \
360 } \
361 } \
362 }
363
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
374
375
376 /*
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
382 */
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
385
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
390
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
395
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
400
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
405
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
410
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
415
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
420
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
425
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
430
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
435
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
440
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
445
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
449 {
450 return (n * m) >> 8;
451 }
452
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
454 {
455 return (n * m) >> 16;
456 }
457
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
459 {
460 return (n * m) >> 32;
461 }
462
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
464 {
465 uint64_t lo, hi;
466 muls64(&lo, &hi, n, m);
467 return hi;
468 }
469
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
471 {
472 uint64_t lo, hi;
473 mulu64(&lo, &hi, n, m);
474 return hi;
475 }
476
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
481
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
486
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
491
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
494
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
497
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
503
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
507
508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
511
512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
515
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
519
520 #undef DO_ZPZZ
521 #undef DO_ZPZZ_D
522
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
526 */
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
529 { \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
534 do { \
535 if (pg & 1) { \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
538 } \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
540 } while (i & 7); \
541 } \
542 }
543
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
547
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
551
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
555
556 #undef DO_ZPZW
557
558 /* Fully general two-operand expander, controlled by a predicate.
559 */
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
562 { \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 do { \
567 if (pg & 1) { \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
570 } \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
572 } while (i & 15); \
573 } \
574 }
575
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
579 { \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
582 uint8_t *pg = vg; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
585 TYPE nn = n[i]; \
586 d[i] = OP(nn); \
587 } \
588 } \
589 }
590
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
593
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
598
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
601
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
606
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
611
612 #define DO_CNOT(N) (N == 0)
613
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
618
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
620
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
624
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
626
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
630
631 #define DO_NOT(N) (~N)
632
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
637
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
644
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
651
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
658
659 #define DO_ABS(N) (N < 0 ? -N : N)
660
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
665
666 #define DO_NEG(N) (-N)
667
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
672
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
676
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
679
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
681
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
686
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
688 */
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
691 { \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
695 do { \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
698 i += sizeof(TYPE); \
699 } while (i & 7); \
700 } \
701 }
702
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
706
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
710
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
714
715 #undef DO_ZZW
716
717 #undef DO_CLS_B
718 #undef DO_CLS_H
719 #undef DO_CLZ_B
720 #undef DO_CLZ_H
721 #undef DO_CNOT
722 #undef DO_FABS
723 #undef DO_FNEG
724 #undef DO_ABS
725 #undef DO_NEG
726 #undef DO_ZPZ
727 #undef DO_ZPZ_D
728
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
734 */
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
737 */
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
740 { \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 if (pg & 1) { \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 ret = OP(ret, nn); \
749 } \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 } while (i & 15); \
752 } \
753 return (TYPERET)ret; \
754 }
755
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
758 { \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
760 TYPEE *n = vn; \
761 uint8_t *pg = vg; \
762 TYPER ret = INIT; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
765 TYPEE nn = n[i]; \
766 ret = OP(ret, nn); \
767 } \
768 } \
769 return ret; \
770 }
771
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
776
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
781
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
786
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
790
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
795
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
800
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
805
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
810
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
815
816 #undef DO_VPZ
817 #undef DO_VPZ_D
818
819 /* Two vector operand, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
822 { \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
827 } \
828 }
829
830 #define DO_SUBR(X, Y) (Y - X)
831
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
836
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
841
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
846
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
851
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
856
857 #undef DO_ZZI
858
859 #undef DO_AND
860 #undef DO_ORR
861 #undef DO_EOR
862 #undef DO_BIC
863 #undef DO_ADD
864 #undef DO_SUB
865 #undef DO_MAX
866 #undef DO_MIN
867 #undef DO_ABD
868 #undef DO_MUL
869 #undef DO_DIV
870 #undef DO_ASR
871 #undef DO_LSR
872 #undef DO_LSL
873 #undef DO_SUBR
874
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
879 {
880 uint64_t mask = pred_esz_masks[esz];
881 intptr_t i = words;
882
883 do {
884 uint64_t this_g = g[--i] & mask;
885 if (this_g) {
886 return i * 64 + (63 - clz64(this_g));
887 }
888 } while (i > 0);
889 return (intptr_t)-1 << esz;
890 }
891
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
893 {
894 uint32_t flags = PREDTEST_INIT;
895 uint64_t *d = vd, *g = vg;
896 intptr_t i = 0;
897
898 do {
899 uint64_t this_d = d[i];
900 uint64_t this_g = g[i];
901
902 if (this_g) {
903 if (!(flags & 4)) {
904 /* Set in D the first bit of G. */
905 this_d |= this_g & -this_g;
906 d[i] = this_d;
907 }
908 flags = iter_predtest_fwd(this_d, this_g, flags);
909 }
910 } while (++i < words);
911
912 return flags;
913 }
914
915 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
916 {
917 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
918 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
919 uint32_t flags = PREDTEST_INIT;
920 uint64_t *d = vd, *g = vg, esz_mask;
921 intptr_t i, next;
922
923 next = last_active_element(vd, words, esz) + (1 << esz);
924 esz_mask = pred_esz_masks[esz];
925
926 /* Similar to the pseudocode for pnext, but scaled by ESZ
927 so that we find the correct bit. */
928 if (next < words * 64) {
929 uint64_t mask = -1;
930
931 if (next & 63) {
932 mask = ~((1ull << (next & 63)) - 1);
933 next &= -64;
934 }
935 do {
936 uint64_t this_g = g[next / 64] & esz_mask & mask;
937 if (this_g != 0) {
938 next = (next & -64) + ctz64(this_g);
939 break;
940 }
941 next += 64;
942 mask = -1;
943 } while (next < words * 64);
944 }
945
946 i = 0;
947 do {
948 uint64_t this_d = 0;
949 if (i == next / 64) {
950 this_d = 1ull << (next & 63);
951 }
952 d[i] = this_d;
953 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
954 } while (++i < words);
955
956 return flags;
957 }
958
959 /*
960 * Copy Zn into Zd, and store zero into inactive elements.
961 * If inv, store zeros into the active elements.
962 */
963 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
964 {
965 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
966 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
967 uint64_t *d = vd, *n = vn;
968 uint8_t *pg = vg;
969
970 for (i = 0; i < opr_sz; i += 1) {
971 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
972 }
973 }
974
975 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
976 {
977 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
978 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
979 uint64_t *d = vd, *n = vn;
980 uint8_t *pg = vg;
981
982 for (i = 0; i < opr_sz; i += 1) {
983 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
984 }
985 }
986
987 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
988 {
989 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
990 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
991 uint64_t *d = vd, *n = vn;
992 uint8_t *pg = vg;
993
994 for (i = 0; i < opr_sz; i += 1) {
995 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
996 }
997 }
998
999 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1000 {
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1002 uint64_t *d = vd, *n = vn;
1003 uint8_t *pg = vg;
1004 uint8_t inv = simd_data(desc);
1005
1006 for (i = 0; i < opr_sz; i += 1) {
1007 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1008 }
1009 }
1010
1011 /* Three-operand expander, immediate operand, controlled by a predicate.
1012 */
1013 #define DO_ZPZI(NAME, TYPE, H, OP) \
1014 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1015 { \
1016 intptr_t i, opr_sz = simd_oprsz(desc); \
1017 TYPE imm = simd_data(desc); \
1018 for (i = 0; i < opr_sz; ) { \
1019 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1020 do { \
1021 if (pg & 1) { \
1022 TYPE nn = *(TYPE *)(vn + H(i)); \
1023 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1024 } \
1025 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1026 } while (i & 15); \
1027 } \
1028 }
1029
1030 /* Similarly, specialized for 64-bit operands. */
1031 #define DO_ZPZI_D(NAME, TYPE, OP) \
1032 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1033 { \
1034 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1035 TYPE *d = vd, *n = vn; \
1036 TYPE imm = simd_data(desc); \
1037 uint8_t *pg = vg; \
1038 for (i = 0; i < opr_sz; i += 1) { \
1039 if (pg[H1(i)] & 1) { \
1040 TYPE nn = n[i]; \
1041 d[i] = OP(nn, imm); \
1042 } \
1043 } \
1044 }
1045
1046 #define DO_SHR(N, M) (N >> M)
1047 #define DO_SHL(N, M) (N << M)
1048
1049 /* Arithmetic shift right for division. This rounds negative numbers
1050 toward zero as per signed division. Therefore before shifting,
1051 when N is negative, add 2**M-1. */
1052 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1053
1054 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1055 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1056 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1057 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1058
1059 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1060 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1061 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1062 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1063
1064 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1065 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1066 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1067 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1068
1069 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1070 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1071 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1072 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1073
1074 #undef DO_SHR
1075 #undef DO_SHL
1076 #undef DO_ASRD
1077 #undef DO_ZPZI
1078 #undef DO_ZPZI_D
1079
1080 /* Fully general four-operand expander, controlled by a predicate.
1081 */
1082 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1083 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1084 void *vg, uint32_t desc) \
1085 { \
1086 intptr_t i, opr_sz = simd_oprsz(desc); \
1087 for (i = 0; i < opr_sz; ) { \
1088 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1089 do { \
1090 if (pg & 1) { \
1091 TYPE nn = *(TYPE *)(vn + H(i)); \
1092 TYPE mm = *(TYPE *)(vm + H(i)); \
1093 TYPE aa = *(TYPE *)(va + H(i)); \
1094 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1095 } \
1096 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1097 } while (i & 15); \
1098 } \
1099 }
1100
1101 /* Similarly, specialized for 64-bit operands. */
1102 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1103 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1104 void *vg, uint32_t desc) \
1105 { \
1106 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1107 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1108 uint8_t *pg = vg; \
1109 for (i = 0; i < opr_sz; i += 1) { \
1110 if (pg[H1(i)] & 1) { \
1111 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1112 d[i] = OP(aa, nn, mm); \
1113 } \
1114 } \
1115 }
1116
1117 #define DO_MLA(A, N, M) (A + N * M)
1118 #define DO_MLS(A, N, M) (A - N * M)
1119
1120 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1121 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1122
1123 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1124 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1125
1126 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1127 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1128
1129 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1130 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1131
1132 #undef DO_MLA
1133 #undef DO_MLS
1134 #undef DO_ZPZZZ
1135 #undef DO_ZPZZZ_D
1136
1137 void HELPER(sve_index_b)(void *vd, uint32_t start,
1138 uint32_t incr, uint32_t desc)
1139 {
1140 intptr_t i, opr_sz = simd_oprsz(desc);
1141 uint8_t *d = vd;
1142 for (i = 0; i < opr_sz; i += 1) {
1143 d[H1(i)] = start + i * incr;
1144 }
1145 }
1146
1147 void HELPER(sve_index_h)(void *vd, uint32_t start,
1148 uint32_t incr, uint32_t desc)
1149 {
1150 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1151 uint16_t *d = vd;
1152 for (i = 0; i < opr_sz; i += 1) {
1153 d[H2(i)] = start + i * incr;
1154 }
1155 }
1156
1157 void HELPER(sve_index_s)(void *vd, uint32_t start,
1158 uint32_t incr, uint32_t desc)
1159 {
1160 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1161 uint32_t *d = vd;
1162 for (i = 0; i < opr_sz; i += 1) {
1163 d[H4(i)] = start + i * incr;
1164 }
1165 }
1166
1167 void HELPER(sve_index_d)(void *vd, uint64_t start,
1168 uint64_t incr, uint32_t desc)
1169 {
1170 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1171 uint64_t *d = vd;
1172 for (i = 0; i < opr_sz; i += 1) {
1173 d[i] = start + i * incr;
1174 }
1175 }
1176
1177 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1178 {
1179 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1180 uint32_t sh = simd_data(desc);
1181 uint32_t *d = vd, *n = vn, *m = vm;
1182 for (i = 0; i < opr_sz; i += 1) {
1183 d[i] = n[i] + (m[i] << sh);
1184 }
1185 }
1186
1187 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1188 {
1189 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1190 uint64_t sh = simd_data(desc);
1191 uint64_t *d = vd, *n = vn, *m = vm;
1192 for (i = 0; i < opr_sz; i += 1) {
1193 d[i] = n[i] + (m[i] << sh);
1194 }
1195 }
1196
1197 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1198 {
1199 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1200 uint64_t sh = simd_data(desc);
1201 uint64_t *d = vd, *n = vn, *m = vm;
1202 for (i = 0; i < opr_sz; i += 1) {
1203 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1204 }
1205 }
1206
1207 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1208 {
1209 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1210 uint64_t sh = simd_data(desc);
1211 uint64_t *d = vd, *n = vn, *m = vm;
1212 for (i = 0; i < opr_sz; i += 1) {
1213 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1214 }
1215 }
1216
1217 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1218 {
1219 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1220 static const uint16_t coeff[] = {
1221 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1222 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1223 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1224 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1225 };
1226 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1227 uint16_t *d = vd, *n = vn;
1228
1229 for (i = 0; i < opr_sz; i++) {
1230 uint16_t nn = n[i];
1231 intptr_t idx = extract32(nn, 0, 5);
1232 uint16_t exp = extract32(nn, 5, 5);
1233 d[i] = coeff[idx] | (exp << 10);
1234 }
1235 }
1236
1237 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1238 {
1239 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1240 static const uint32_t coeff[] = {
1241 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1242 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1243 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1244 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1245 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1246 0x1ef532, 0x20b051, 0x227043, 0x243516,
1247 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1248 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1249 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1250 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1251 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1252 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1253 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1254 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1255 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1256 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1257 };
1258 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1259 uint32_t *d = vd, *n = vn;
1260
1261 for (i = 0; i < opr_sz; i++) {
1262 uint32_t nn = n[i];
1263 intptr_t idx = extract32(nn, 0, 6);
1264 uint32_t exp = extract32(nn, 6, 8);
1265 d[i] = coeff[idx] | (exp << 23);
1266 }
1267 }
1268
1269 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1270 {
1271 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1272 static const uint64_t coeff[] = {
1273 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1274 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1275 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1276 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1277 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1278 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1279 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1280 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1281 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1282 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1283 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1284 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1285 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1286 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1287 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1288 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1289 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1290 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1291 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1292 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1293 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1294 0xFA7C1819E90D8ull,
1295 };
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1297 uint64_t *d = vd, *n = vn;
1298
1299 for (i = 0; i < opr_sz; i++) {
1300 uint64_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint64_t exp = extract32(nn, 6, 11);
1303 d[i] = coeff[idx] | (exp << 52);
1304 }
1305 }
1306
1307 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1308 {
1309 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1310 uint16_t *d = vd, *n = vn, *m = vm;
1311 for (i = 0; i < opr_sz; i += 1) {
1312 uint16_t nn = n[i];
1313 uint16_t mm = m[i];
1314 if (mm & 1) {
1315 nn = float16_one;
1316 }
1317 d[i] = nn ^ (mm & 2) << 14;
1318 }
1319 }
1320
1321 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1322 {
1323 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1324 uint32_t *d = vd, *n = vn, *m = vm;
1325 for (i = 0; i < opr_sz; i += 1) {
1326 uint32_t nn = n[i];
1327 uint32_t mm = m[i];
1328 if (mm & 1) {
1329 nn = float32_one;
1330 }
1331 d[i] = nn ^ (mm & 2) << 30;
1332 }
1333 }
1334
1335 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1336 {
1337 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1338 uint64_t *d = vd, *n = vn, *m = vm;
1339 for (i = 0; i < opr_sz; i += 1) {
1340 uint64_t nn = n[i];
1341 uint64_t mm = m[i];
1342 if (mm & 1) {
1343 nn = float64_one;
1344 }
1345 d[i] = nn ^ (mm & 2) << 62;
1346 }
1347 }
1348
1349 /*
1350 * Signed saturating addition with scalar operand.
1351 */
1352
1353 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1354 {
1355 intptr_t i, oprsz = simd_oprsz(desc);
1356
1357 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1358 int r = *(int8_t *)(a + i) + b;
1359 if (r > INT8_MAX) {
1360 r = INT8_MAX;
1361 } else if (r < INT8_MIN) {
1362 r = INT8_MIN;
1363 }
1364 *(int8_t *)(d + i) = r;
1365 }
1366 }
1367
1368 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1369 {
1370 intptr_t i, oprsz = simd_oprsz(desc);
1371
1372 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1373 int r = *(int16_t *)(a + i) + b;
1374 if (r > INT16_MAX) {
1375 r = INT16_MAX;
1376 } else if (r < INT16_MIN) {
1377 r = INT16_MIN;
1378 }
1379 *(int16_t *)(d + i) = r;
1380 }
1381 }
1382
1383 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1384 {
1385 intptr_t i, oprsz = simd_oprsz(desc);
1386
1387 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1388 int64_t r = *(int32_t *)(a + i) + b;
1389 if (r > INT32_MAX) {
1390 r = INT32_MAX;
1391 } else if (r < INT32_MIN) {
1392 r = INT32_MIN;
1393 }
1394 *(int32_t *)(d + i) = r;
1395 }
1396 }
1397
1398 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1399 {
1400 intptr_t i, oprsz = simd_oprsz(desc);
1401
1402 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1403 int64_t ai = *(int64_t *)(a + i);
1404 int64_t r = ai + b;
1405 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1406 /* Signed overflow. */
1407 r = (r < 0 ? INT64_MAX : INT64_MIN);
1408 }
1409 *(int64_t *)(d + i) = r;
1410 }
1411 }
1412
1413 /*
1414 * Unsigned saturating addition with scalar operand.
1415 */
1416
1417 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1418 {
1419 intptr_t i, oprsz = simd_oprsz(desc);
1420
1421 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1422 int r = *(uint8_t *)(a + i) + b;
1423 if (r > UINT8_MAX) {
1424 r = UINT8_MAX;
1425 } else if (r < 0) {
1426 r = 0;
1427 }
1428 *(uint8_t *)(d + i) = r;
1429 }
1430 }
1431
1432 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1433 {
1434 intptr_t i, oprsz = simd_oprsz(desc);
1435
1436 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1437 int r = *(uint16_t *)(a + i) + b;
1438 if (r > UINT16_MAX) {
1439 r = UINT16_MAX;
1440 } else if (r < 0) {
1441 r = 0;
1442 }
1443 *(uint16_t *)(d + i) = r;
1444 }
1445 }
1446
1447 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1448 {
1449 intptr_t i, oprsz = simd_oprsz(desc);
1450
1451 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1452 int64_t r = *(uint32_t *)(a + i) + b;
1453 if (r > UINT32_MAX) {
1454 r = UINT32_MAX;
1455 } else if (r < 0) {
1456 r = 0;
1457 }
1458 *(uint32_t *)(d + i) = r;
1459 }
1460 }
1461
1462 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1463 {
1464 intptr_t i, oprsz = simd_oprsz(desc);
1465
1466 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1467 uint64_t r = *(uint64_t *)(a + i) + b;
1468 if (r < b) {
1469 r = UINT64_MAX;
1470 }
1471 *(uint64_t *)(d + i) = r;
1472 }
1473 }
1474
1475 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1476 {
1477 intptr_t i, oprsz = simd_oprsz(desc);
1478
1479 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1480 uint64_t ai = *(uint64_t *)(a + i);
1481 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1482 }
1483 }
1484
1485 /* Two operand predicated copy immediate with merge. All valid immediates
1486 * can fit within 17 signed bits in the simd_data field.
1487 */
1488 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1489 uint64_t mm, uint32_t desc)
1490 {
1491 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1492 uint64_t *d = vd, *n = vn;
1493 uint8_t *pg = vg;
1494
1495 mm = dup_const(MO_8, mm);
1496 for (i = 0; i < opr_sz; i += 1) {
1497 uint64_t nn = n[i];
1498 uint64_t pp = expand_pred_b(pg[H1(i)]);
1499 d[i] = (mm & pp) | (nn & ~pp);
1500 }
1501 }
1502
1503 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1504 uint64_t mm, uint32_t desc)
1505 {
1506 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1507 uint64_t *d = vd, *n = vn;
1508 uint8_t *pg = vg;
1509
1510 mm = dup_const(MO_16, mm);
1511 for (i = 0; i < opr_sz; i += 1) {
1512 uint64_t nn = n[i];
1513 uint64_t pp = expand_pred_h(pg[H1(i)]);
1514 d[i] = (mm & pp) | (nn & ~pp);
1515 }
1516 }
1517
1518 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1519 uint64_t mm, uint32_t desc)
1520 {
1521 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1522 uint64_t *d = vd, *n = vn;
1523 uint8_t *pg = vg;
1524
1525 mm = dup_const(MO_32, mm);
1526 for (i = 0; i < opr_sz; i += 1) {
1527 uint64_t nn = n[i];
1528 uint64_t pp = expand_pred_s(pg[H1(i)]);
1529 d[i] = (mm & pp) | (nn & ~pp);
1530 }
1531 }
1532
1533 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1534 uint64_t mm, uint32_t desc)
1535 {
1536 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1537 uint64_t *d = vd, *n = vn;
1538 uint8_t *pg = vg;
1539
1540 for (i = 0; i < opr_sz; i += 1) {
1541 uint64_t nn = n[i];
1542 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1543 }
1544 }
1545
1546 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1547 {
1548 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1549 uint64_t *d = vd;
1550 uint8_t *pg = vg;
1551
1552 val = dup_const(MO_8, val);
1553 for (i = 0; i < opr_sz; i += 1) {
1554 d[i] = val & expand_pred_b(pg[H1(i)]);
1555 }
1556 }
1557
1558 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1559 {
1560 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1561 uint64_t *d = vd;
1562 uint8_t *pg = vg;
1563
1564 val = dup_const(MO_16, val);
1565 for (i = 0; i < opr_sz; i += 1) {
1566 d[i] = val & expand_pred_h(pg[H1(i)]);
1567 }
1568 }
1569
1570 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1571 {
1572 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1573 uint64_t *d = vd;
1574 uint8_t *pg = vg;
1575
1576 val = dup_const(MO_32, val);
1577 for (i = 0; i < opr_sz; i += 1) {
1578 d[i] = val & expand_pred_s(pg[H1(i)]);
1579 }
1580 }
1581
1582 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1583 {
1584 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1585 uint64_t *d = vd;
1586 uint8_t *pg = vg;
1587
1588 for (i = 0; i < opr_sz; i += 1) {
1589 d[i] = (pg[H1(i)] & 1 ? val : 0);
1590 }
1591 }
1592
1593 /* Big-endian hosts need to frob the byte indices. If the copy
1594 * happens to be 8-byte aligned, then no frobbing necessary.
1595 */
1596 static void swap_memmove(void *vd, void *vs, size_t n)
1597 {
1598 uintptr_t d = (uintptr_t)vd;
1599 uintptr_t s = (uintptr_t)vs;
1600 uintptr_t o = (d | s | n) & 7;
1601 size_t i;
1602
1603 #ifndef HOST_WORDS_BIGENDIAN
1604 o = 0;
1605 #endif
1606 switch (o) {
1607 case 0:
1608 memmove(vd, vs, n);
1609 break;
1610
1611 case 4:
1612 if (d < s || d >= s + n) {
1613 for (i = 0; i < n; i += 4) {
1614 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1615 }
1616 } else {
1617 for (i = n; i > 0; ) {
1618 i -= 4;
1619 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1620 }
1621 }
1622 break;
1623
1624 case 2:
1625 case 6:
1626 if (d < s || d >= s + n) {
1627 for (i = 0; i < n; i += 2) {
1628 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1629 }
1630 } else {
1631 for (i = n; i > 0; ) {
1632 i -= 2;
1633 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1634 }
1635 }
1636 break;
1637
1638 default:
1639 if (d < s || d >= s + n) {
1640 for (i = 0; i < n; i++) {
1641 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1642 }
1643 } else {
1644 for (i = n; i > 0; ) {
1645 i -= 1;
1646 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1647 }
1648 }
1649 break;
1650 }
1651 }
1652
1653 /* Similarly for memset of 0. */
1654 static void swap_memzero(void *vd, size_t n)
1655 {
1656 uintptr_t d = (uintptr_t)vd;
1657 uintptr_t o = (d | n) & 7;
1658 size_t i;
1659
1660 /* Usually, the first bit of a predicate is set, so N is 0. */
1661 if (likely(n == 0)) {
1662 return;
1663 }
1664
1665 #ifndef HOST_WORDS_BIGENDIAN
1666 o = 0;
1667 #endif
1668 switch (o) {
1669 case 0:
1670 memset(vd, 0, n);
1671 break;
1672
1673 case 4:
1674 for (i = 0; i < n; i += 4) {
1675 *(uint32_t *)H1_4(d + i) = 0;
1676 }
1677 break;
1678
1679 case 2:
1680 case 6:
1681 for (i = 0; i < n; i += 2) {
1682 *(uint16_t *)H1_2(d + i) = 0;
1683 }
1684 break;
1685
1686 default:
1687 for (i = 0; i < n; i++) {
1688 *(uint8_t *)H1(d + i) = 0;
1689 }
1690 break;
1691 }
1692 }
1693
1694 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1695 {
1696 intptr_t opr_sz = simd_oprsz(desc);
1697 size_t n_ofs = simd_data(desc);
1698 size_t n_siz = opr_sz - n_ofs;
1699
1700 if (vd != vm) {
1701 swap_memmove(vd, vn + n_ofs, n_siz);
1702 swap_memmove(vd + n_siz, vm, n_ofs);
1703 } else if (vd != vn) {
1704 swap_memmove(vd + n_siz, vd, n_ofs);
1705 swap_memmove(vd, vn + n_ofs, n_siz);
1706 } else {
1707 /* vd == vn == vm. Need temp space. */
1708 ARMVectorReg tmp;
1709 swap_memmove(&tmp, vm, n_ofs);
1710 swap_memmove(vd, vd + n_ofs, n_siz);
1711 memcpy(vd + n_siz, &tmp, n_ofs);
1712 }
1713 }
1714
1715 #define DO_INSR(NAME, TYPE, H) \
1716 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1717 { \
1718 intptr_t opr_sz = simd_oprsz(desc); \
1719 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1720 *(TYPE *)(vd + H(0)) = val; \
1721 }
1722
1723 DO_INSR(sve_insr_b, uint8_t, H1)
1724 DO_INSR(sve_insr_h, uint16_t, H1_2)
1725 DO_INSR(sve_insr_s, uint32_t, H1_4)
1726 DO_INSR(sve_insr_d, uint64_t, )
1727
1728 #undef DO_INSR
1729
1730 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1731 {
1732 intptr_t i, j, opr_sz = simd_oprsz(desc);
1733 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1734 uint64_t f = *(uint64_t *)(vn + i);
1735 uint64_t b = *(uint64_t *)(vn + j);
1736 *(uint64_t *)(vd + i) = bswap64(b);
1737 *(uint64_t *)(vd + j) = bswap64(f);
1738 }
1739 }
1740
1741 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1742 {
1743 intptr_t i, j, opr_sz = simd_oprsz(desc);
1744 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1745 uint64_t f = *(uint64_t *)(vn + i);
1746 uint64_t b = *(uint64_t *)(vn + j);
1747 *(uint64_t *)(vd + i) = hswap64(b);
1748 *(uint64_t *)(vd + j) = hswap64(f);
1749 }
1750 }
1751
1752 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1753 {
1754 intptr_t i, j, opr_sz = simd_oprsz(desc);
1755 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1756 uint64_t f = *(uint64_t *)(vn + i);
1757 uint64_t b = *(uint64_t *)(vn + j);
1758 *(uint64_t *)(vd + i) = rol64(b, 32);
1759 *(uint64_t *)(vd + j) = rol64(f, 32);
1760 }
1761 }
1762
1763 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1764 {
1765 intptr_t i, j, opr_sz = simd_oprsz(desc);
1766 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1767 uint64_t f = *(uint64_t *)(vn + i);
1768 uint64_t b = *(uint64_t *)(vn + j);
1769 *(uint64_t *)(vd + i) = b;
1770 *(uint64_t *)(vd + j) = f;
1771 }
1772 }
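
/* The four helpers above reverse the order of elements of size
 * 1 << esz within the vector.  Each swaps 64-bit chunks end for end
 * and then reverses the elements inside each chunk: bswap64 for
 * bytes, hswap64 for halfwords, rol64(x, 32) to exchange the two
 * words, and nothing extra for doublewords.
 */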
1773
1774 #define DO_TBL(NAME, TYPE, H) \
1775 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1776 { \
1777 intptr_t i, opr_sz = simd_oprsz(desc); \
1778 uintptr_t elem = opr_sz / sizeof(TYPE); \
1779 TYPE *d = vd, *n = vn, *m = vm; \
1780 ARMVectorReg tmp; \
1781 if (unlikely(vd == vn)) { \
1782 n = memcpy(&tmp, vn, opr_sz); \
1783 } \
1784 for (i = 0; i < elem; i++) { \
1785 TYPE j = m[H(i)]; \
1786 d[H(i)] = j < elem ? n[H(j)] : 0; \
1787 } \
1788 }
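
/* For example, in sve_tbl_b each byte of Zm is an index into Zn:
 * d[i] = m[i] < opr_sz ? n[m[i]] : 0, so out-of-range indices
 * produce zero rather than reading beyond the vector.
 */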
1789
1790 DO_TBL(sve_tbl_b, uint8_t, H1)
1791 DO_TBL(sve_tbl_h, uint16_t, H2)
1792 DO_TBL(sve_tbl_s, uint32_t, H4)
1793 DO_TBL(sve_tbl_d, uint64_t, )
1794
1795 #undef DO_TBL
1796
1797 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1798 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1799 { \
1800 intptr_t i, opr_sz = simd_oprsz(desc); \
1801 TYPED *d = vd; \
1802 TYPES *n = vn; \
1803 ARMVectorReg tmp; \
1804 if (unlikely(vn - vd < opr_sz)) { \
1805 n = memcpy(&tmp, n, opr_sz / 2); \
1806 } \
1807 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1808 d[HD(i)] = n[HS(i)]; \
1809 } \
1810 }
1811
1812 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1813 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1814 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1815
1816 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1817 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1818 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1819
1820 #undef DO_UNPK
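
/* Each UNPK helper reads opr_sz / 2 bytes of narrow elements from the
 * start of VN and writes opr_sz bytes of widened elements to VD,
 * sign- or zero-extending according to TYPED/TYPES.  For example,
 * sve_sunpk_h turns the int8_t value -5 (0xfb) into the int16_t
 * value -5 (0xfffb).
 */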
1821
1822 /* Mask of bits included in the even numbered predicates of width esz.
1823 * We also use this for expand_bits/compress_bits, and so extend the
1824 * same pattern out to 16-bit units.
1825 */
1826 static const uint64_t even_bit_esz_masks[5] = {
1827 0x5555555555555555ull,
1828 0x3333333333333333ull,
1829 0x0f0f0f0f0f0f0f0full,
1830 0x00ff00ff00ff00ffull,
1831 0x0000ffff0000ffffull,
1832 };
1833
1834 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1835 * For N==0, this corresponds to the operation that in qemu/bitops.h
1836 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1837 * section 7-2 Shuffling Bits.
1838 */
1839 static uint64_t expand_bits(uint64_t x, int n)
1840 {
1841 int i;
1842
1843 x &= 0xffffffffu;
1844 for (i = 4; i >= n; i--) {
1845 int sh = 1 << i;
1846 x = ((x << sh) | x) & even_bit_esz_masks[i];
1847 }
1848 return x;
1849 }
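
/* For example, expand_bits(0x000b, 0) == 0x0045: bits 0, 1 and 3 of
 * the input move to bits 0, 2 and 6, with zeros interleaved between
 * them.  Larger N spreads correspondingly larger units apart.
 */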
1850
1851 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1852 * For N==0, this corresponds to the operation that in qemu/bitops.h
1853 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1854 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1855 */
1856 static uint64_t compress_bits(uint64_t x, int n)
1857 {
1858 int i;
1859
1860 for (i = n; i <= 4; i++) {
1861 int sh = 1 << i;
1862 x &= even_bit_esz_masks[i];
1863 x = (x >> sh) | x;
1864 }
1865 return x & 0xffffffffu;
1866 }
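
/* This is the inverse of expand_bits for inputs that only have bits
 * set in the even-numbered units, e.g. compress_bits(0x0045, 0)
 * == 0x000b.  Bits in the odd-numbered units are don't-care; they are
 * discarded by the mask applied in the first iteration.
 */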
1867
1868 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1869 {
1870 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1871 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1872 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1873 uint64_t *d = vd;
1874 intptr_t i;
1875
1876 if (oprsz <= 8) {
1877 uint64_t nn = *(uint64_t *)vn;
1878 uint64_t mm = *(uint64_t *)vm;
1879 int half = 4 * oprsz;
1880
1881 nn = extract64(nn, high * half, half);
1882 mm = extract64(mm, high * half, half);
1883 nn = expand_bits(nn, esz);
1884 mm = expand_bits(mm, esz);
1885 d[0] = nn + (mm << (1 << esz));
1886 } else {
1887 ARMPredicateReg tmp_n, tmp_m;
1888
1889 /* We produce output faster than we consume input.
1890 Therefore we must be mindful of possible overlap. */
1891 if ((vn - vd) < (uintptr_t)oprsz) {
1892 vn = memcpy(&tmp_n, vn, oprsz);
1893 }
1894 if ((vm - vd) < (uintptr_t)oprsz) {
1895 vm = memcpy(&tmp_m, vm, oprsz);
1896 }
1897 if (high) {
1898 high = oprsz >> 1;
1899 }
1900
1901 if ((high & 3) == 0) {
1902 uint32_t *n = vn, *m = vm;
1903 high >>= 2;
1904
1905 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1906 uint64_t nn = n[H4(high + i)];
1907 uint64_t mm = m[H4(high + i)];
1908
1909 nn = expand_bits(nn, esz);
1910 mm = expand_bits(mm, esz);
1911 d[i] = nn + (mm << (1 << esz));
1912 }
1913 } else {
1914 uint8_t *n = vn, *m = vm;
1915 uint16_t *d16 = vd;
1916
1917 for (i = 0; i < oprsz / 2; i++) {
1918 uint16_t nn = n[H1(high + i)];
1919 uint16_t mm = m[H1(high + i)];
1920
1921 nn = expand_bits(nn, esz);
1922 mm = expand_bits(mm, esz);
1923 d16[H2(i)] = nn + (mm << (1 << esz));
1924 }
1925 }
1926 }
1927 }
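
/* Worked example for the short path: after the extract64, suppose
 * nn == 0x3 and mm == 0x5 with esz == 0.  expand_bits gives 0x05 and
 * 0x11 respectively, so d[0] = 0x05 + (0x11 << 1) = 0x27, which is
 * the bits of nn and mm interleaved pairwise, as ZIP requires.
 */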
1928
1929 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1930 {
1931 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1932 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1933 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1934 uint64_t *d = vd, *n = vn, *m = vm;
1935 uint64_t l, h;
1936 intptr_t i;
1937
1938 if (oprsz <= 8) {
1939 l = compress_bits(n[0] >> odd, esz);
1940 h = compress_bits(m[0] >> odd, esz);
1941 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1942 } else {
1943 ARMPredicateReg tmp_m;
1944 intptr_t oprsz_16 = oprsz / 16;
1945
1946 if ((vm - vd) < (uintptr_t)oprsz) {
1947 m = memcpy(&tmp_m, vm, oprsz);
1948 }
1949
1950 for (i = 0; i < oprsz_16; i++) {
1951 l = n[2 * i + 0];
1952 h = n[2 * i + 1];
1953 l = compress_bits(l >> odd, esz);
1954 h = compress_bits(h >> odd, esz);
1955 d[i] = l + (h << 32);
1956 }
1957
1958 /* For VL which is not a power of 2, the results from M do not
1959 align nicely with the uint64_t for D. Put the aligned results
1960 from M into TMP_M and then copy it into place afterward. */
1961 if (oprsz & 15) {
1962 d[i] = compress_bits(n[2 * i] >> odd, esz);
1963
1964 for (i = 0; i < oprsz_16; i++) {
1965 l = m[2 * i + 0];
1966 h = m[2 * i + 1];
1967 l = compress_bits(l >> odd, esz);
1968 h = compress_bits(h >> odd, esz);
1969 tmp_m.p[i] = l + (h << 32);
1970 }
1971 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1972
1973 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1974 } else {
1975 for (i = 0; i < oprsz_16; i++) {
1976 l = m[2 * i + 0];
1977 h = m[2 * i + 1];
1978 l = compress_bits(l >> odd, esz);
1979 h = compress_bits(h >> odd, esz);
1980 d[oprsz_16 + i] = l + (h << 32);
1981 }
1982 }
1983 }
1984 }
1985
1986 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1987 {
1988 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1989 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1990 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1991 uint64_t *d = vd, *n = vn, *m = vm;
1992 uint64_t mask;
1993 int shr, shl;
1994 intptr_t i;
1995
1996 shl = 1 << esz;
1997 shr = 0;
1998 mask = even_bit_esz_masks[esz];
1999 if (odd) {
2000 mask <<= shl;
2001 shr = shl;
2002 shl = 0;
2003 }
2004
2005 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2006 uint64_t nn = (n[i] & mask) >> shr;
2007 uint64_t mm = (m[i] & mask) << shl;
2008 d[i] = nn + mm;
2009 }
2010 }
2011
2012 /* Reverse units of 2**N bits. */
2013 static uint64_t reverse_bits_64(uint64_t x, int n)
2014 {
2015 int i, sh;
2016
2017 x = bswap64(x);
2018 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2019 uint64_t mask = even_bit_esz_masks[i];
2020 x = ((x & mask) << sh) | ((x >> sh) & mask);
2021 }
2022 return x;
2023 }
2024
2025 static uint8_t reverse_bits_8(uint8_t x, int n)
2026 {
2027 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2028 int i, sh;
2029
2030 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2031 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2032 }
2033 return x;
2034 }
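
/* For example, reverse_bits_8(0xc1, 0) == 0x83 (a full bit reversal),
 * while reverse_bits_8(0xc1, 2) == 0x1c (only the two nibbles are
 * exchanged).  reverse_bits_64 behaves the same way after its
 * initial byte swap.
 */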
2035
2036 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2037 {
2038 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2039 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2040 intptr_t i, oprsz_2 = oprsz / 2;
2041
2042 if (oprsz <= 8) {
2043 uint64_t l = *(uint64_t *)vn;
2044 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2045 *(uint64_t *)vd = l;
2046 } else if ((oprsz & 15) == 0) {
2047 for (i = 0; i < oprsz_2; i += 8) {
2048 intptr_t ih = oprsz - 8 - i;
2049 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2050 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2051 *(uint64_t *)(vd + i) = h;
2052 *(uint64_t *)(vd + ih) = l;
2053 }
2054 } else {
2055 for (i = 0; i < oprsz_2; i += 1) {
2056 intptr_t il = H1(i);
2057 intptr_t ih = H1(oprsz - 1 - i);
2058 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2059 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2060 *(uint8_t *)(vd + il) = h;
2061 *(uint8_t *)(vd + ih) = l;
2062 }
2063 }
2064 }
2065
2066 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2067 {
2068 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2069 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2070 uint64_t *d = vd;
2071 intptr_t i;
2072
2073 if (oprsz <= 8) {
2074 uint64_t nn = *(uint64_t *)vn;
2075 int half = 4 * oprsz;
2076
2077 nn = extract64(nn, high * half, half);
2078 nn = expand_bits(nn, 0);
2079 d[0] = nn;
2080 } else {
2081 ARMPredicateReg tmp_n;
2082
2083 /* We produce output faster than we consume input.
2084 Therefore we must be mindful of possible overlap. */
2085 if ((vn - vd) < (uintptr_t)oprsz) {
2086 vn = memcpy(&tmp_n, vn, oprsz);
2087 }
2088 if (high) {
2089 high = oprsz >> 1;
2090 }
2091
2092 if ((high & 3) == 0) {
2093 uint32_t *n = vn;
2094 high >>= 2;
2095
2096 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2097 uint64_t nn = n[H4(high + i)];
2098 d[i] = expand_bits(nn, 0);
2099 }
2100 } else {
2101 uint16_t *d16 = vd;
2102 uint8_t *n = vn;
2103
2104 for (i = 0; i < oprsz / 2; i++) {
2105 uint16_t nn = n[H1(high + i)];
2106 d16[H2(i)] = expand_bits(nn, 0);
2107 }
2108 }
2109 }
2110 }
2111
2112 #define DO_ZIP(NAME, TYPE, H) \
2113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2114 { \
2115 intptr_t oprsz = simd_oprsz(desc); \
2116 intptr_t i, oprsz_2 = oprsz / 2; \
2117 ARMVectorReg tmp_n, tmp_m; \
2118 /* We produce output faster than we consume input. \
2119 Therefore we must be mindful of possible overlap. */ \
2120 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2121 vn = memcpy(&tmp_n, vn, oprsz_2); \
2122 } \
2123 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2124 vm = memcpy(&tmp_m, vm, oprsz_2); \
2125 } \
2126 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2127 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2128 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2129 } \
2130 }
2131
2132 DO_ZIP(sve_zip_b, uint8_t, H1)
2133 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2134 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2135 DO_ZIP(sve_zip_d, uint64_t, )
2136
2137 #define DO_UZP(NAME, TYPE, H) \
2138 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2139 { \
2140 intptr_t oprsz = simd_oprsz(desc); \
2141 intptr_t oprsz_2 = oprsz / 2; \
2142 intptr_t odd_ofs = simd_data(desc); \
2143 intptr_t i; \
2144 ARMVectorReg tmp_m; \
2145 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2146 vm = memcpy(&tmp_m, vm, oprsz); \
2147 } \
2148 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2149 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2150 } \
2151 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2152 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2153 } \
2154 }
2155
2156 DO_UZP(sve_uzp_b, uint8_t, H1)
2157 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2158 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2159 DO_UZP(sve_uzp_d, uint64_t, )
2160
2161 #define DO_TRN(NAME, TYPE, H) \
2162 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2163 { \
2164 intptr_t oprsz = simd_oprsz(desc); \
2165 intptr_t odd_ofs = simd_data(desc); \
2166 intptr_t i; \
2167 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2168 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2169 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2170 *(TYPE *)(vd + H(i + 0)) = ae; \
2171 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2172 } \
2173 }
2174
2175 DO_TRN(sve_trn_b, uint8_t, H1)
2176 DO_TRN(sve_trn_h, uint16_t, H1_2)
2177 DO_TRN(sve_trn_s, uint32_t, H1_4)
2178 DO_TRN(sve_trn_d, uint64_t, )
2179
2180 #undef DO_ZIP
2181 #undef DO_UZP
2182 #undef DO_TRN
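
/* Byte-level illustration of the three permutations, writing the
 * inputs as n = {n0, n1, n2, ...} and m = {m0, m1, m2, ...}:
 *
 *   sve_zip_b:               d = {n0, m0, n1, m1, n2, m2, ...}
 *   sve_uzp_b, odd_ofs == 0: d = {n0, n2, n4, ..., m0, m2, m4, ...}
 *   sve_trn_b, odd_ofs == 0: d = {n0, m0, n2, m2, n4, m4, ...}
 *
 * A non-zero odd_ofs selects the odd-numbered elements instead.
 */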
2183
2184 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2185 {
2186 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2187 uint32_t *d = vd, *n = vn;
2188 uint8_t *pg = vg;
2189
2190 for (i = j = 0; i < opr_sz; i++) {
2191 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2192 d[H4(j)] = n[H4(i)];
2193 j++;
2194 }
2195 }
2196 for (; j < opr_sz; j++) {
2197 d[H4(j)] = 0;
2198 }
2199 }
2200
2201 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2202 {
2203 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2204 uint64_t *d = vd, *n = vn;
2205 uint8_t *pg = vg;
2206
2207 for (i = j = 0; i < opr_sz; i++) {
2208 if (pg[H1(i)] & 1) {
2209 d[j] = n[i];
2210 j++;
2211 }
2212 }
2213 for (; j < opr_sz; j++) {
2214 d[j] = 0;
2215 }
2216 }
2217
2218 /* Similar to the ARM LastActiveElement pseudocode function, except the
2219 * result is multiplied by the element size. This includes the not found
2220 * indication; e.g. not found for esz=3 is -8.
2221 */
2222 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2223 {
2224 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2225 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2226
2227 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2228 }
2229
2230 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2231 {
2232 intptr_t opr_sz = simd_oprsz(desc) / 8;
2233 int esz = simd_data(desc);
2234 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2235 intptr_t i, first_i, last_i;
2236 ARMVectorReg tmp;
2237
2238 first_i = last_i = 0;
2239 first_g = last_g = 0;
2240
2241 /* Find the extent of the active elements within VG. */
2242 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2243 pg = *(uint64_t *)(vg + i) & mask;
2244 if (pg) {
2245 if (last_g == 0) {
2246 last_g = pg;
2247 last_i = i;
2248 }
2249 first_g = pg;
2250 first_i = i;
2251 }
2252 }
2253
2254 len = 0;
2255 if (first_g != 0) {
2256 first_i = first_i * 8 + ctz64(first_g);
2257 last_i = last_i * 8 + 63 - clz64(last_g);
2258 len = last_i - first_i + (1 << esz);
2259 if (vd == vm) {
2260 vm = memcpy(&tmp, vm, opr_sz * 8);
2261 }
2262 swap_memmove(vd, vn + first_i, len);
2263 }
2264 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2265 }
2266
2267 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2268 void *vg, uint32_t desc)
2269 {
2270 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2271 uint64_t *d = vd, *n = vn, *m = vm;
2272 uint8_t *pg = vg;
2273
2274 for (i = 0; i < opr_sz; i += 1) {
2275 uint64_t nn = n[i], mm = m[i];
2276 uint64_t pp = expand_pred_b(pg[H1(i)]);
2277 d[i] = (nn & pp) | (mm & ~pp);
2278 }
2279 }
2280
2281 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2282 void *vg, uint32_t desc)
2283 {
2284 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2285 uint64_t *d = vd, *n = vn, *m = vm;
2286 uint8_t *pg = vg;
2287
2288 for (i = 0; i < opr_sz; i += 1) {
2289 uint64_t nn = n[i], mm = m[i];
2290 uint64_t pp = expand_pred_h(pg[H1(i)]);
2291 d[i] = (nn & pp) | (mm & ~pp);
2292 }
2293 }
2294
2295 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2296 void *vg, uint32_t desc)
2297 {
2298 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2299 uint64_t *d = vd, *n = vn, *m = vm;
2300 uint8_t *pg = vg;
2301
2302 for (i = 0; i < opr_sz; i += 1) {
2303 uint64_t nn = n[i], mm = m[i];
2304 uint64_t pp = expand_pred_s(pg[H1(i)]);
2305 d[i] = (nn & pp) | (mm & ~pp);
2306 }
2307 }
2308
2309 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2310 void *vg, uint32_t desc)
2311 {
2312 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2313 uint64_t *d = vd, *n = vn, *m = vm;
2314 uint8_t *pg = vg;
2315
2316 for (i = 0; i < opr_sz; i += 1) {
2317 uint64_t nn = n[i], mm = m[i];
2318 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2319 }
2320 }
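
/* The _b/_h/_s forms above rely on expand_pred_b/h/s to widen each
 * significant predicate bit into an all-ones or all-zeroes lane mask,
 * so SEL reduces to a plain bitwise select.  For instance a predicate
 * byte of 0x05 takes byte lanes 0 and 2 from NN and the remaining
 * lanes from MM.
 */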
2321
2322 /* Two operand comparison controlled by a predicate.
2323  * ??? It is very tempting to expand this inline
2324 * with x86 instructions, e.g.
2325 *
2326 * vcmpeqw zm, zn, %ymm0
2327 * vpmovmskb %ymm0, %eax
2328 * and $0x5555, %eax
2329 * and pg, %eax
2330 *
2331 * or even aarch64, e.g.
2332 *
2333 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2334 * cmeq v0.8h, zn, zm
2335 * and v0.8h, v0.8h, mask
2336 * addv h0, v0.8h
2337 * and v0.8b, pg
2338 *
2339 * However, coming up with an abstraction that allows vector inputs and
2340 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2341 * scalar outputs, is tricky.
2342 */
2343 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2344 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2345 { \
2346 intptr_t opr_sz = simd_oprsz(desc); \
2347 uint32_t flags = PREDTEST_INIT; \
2348 intptr_t i = opr_sz; \
2349 do { \
2350 uint64_t out = 0, pg; \
2351 do { \
2352 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2353 TYPE nn = *(TYPE *)(vn + H(i)); \
2354 TYPE mm = *(TYPE *)(vm + H(i)); \
2355 out |= nn OP mm; \
2356 } while (i & 63); \
2357 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2358 out &= pg; \
2359 *(uint64_t *)(vd + (i >> 3)) = out; \
2360 flags = iter_predtest_bwd(out, pg, flags); \
2361 } while (i > 0); \
2362 return flags; \
2363 }
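
/* In the inner loop above, each element deposits its 0/1 result at
 * the bit index equal to the element's byte offset within the 64-byte
 * block being assembled, which matches the predicate layout of one
 * bit per vector byte.  The per-size MASK below then restricts the
 * governing predicate to the bit for the first byte of each element.
 */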
2364
2365 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2366 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2367 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2368 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2369 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2370 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2371 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2372 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2373
2374 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2375 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2376 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2377 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2378
2379 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2380 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2381 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2382 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2383
2384 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2385 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2386 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2387 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2388
2389 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2390 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2391 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2392 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2393
2394 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2395 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2396 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2397 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2398
2399 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2400 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2401 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2402 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2403
2404 #undef DO_CMP_PPZZ_B
2405 #undef DO_CMP_PPZZ_H
2406 #undef DO_CMP_PPZZ_S
2407 #undef DO_CMP_PPZZ_D
2408 #undef DO_CMP_PPZZ
2409
2410 /* Similar, but the second source is "wide". */
2411 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2412 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2413 { \
2414 intptr_t opr_sz = simd_oprsz(desc); \
2415 uint32_t flags = PREDTEST_INIT; \
2416 intptr_t i = opr_sz; \
2417 do { \
2418 uint64_t out = 0, pg; \
2419 do { \
2420 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2421 do { \
2422 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2423 TYPE nn = *(TYPE *)(vn + H(i)); \
2424 out |= nn OP mm; \
2425 } while (i & 7); \
2426 } while (i & 63); \
2427 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2428 out &= pg; \
2429 *(uint64_t *)(vd + (i >> 3)) = out; \
2430 flags = iter_predtest_bwd(out, pg, flags); \
2431 } while (i > 0); \
2432 return flags; \
2433 }
2434
2435 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2436 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2437 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2438 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2439 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2440 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2441
2442 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2443 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2444 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2445
2446 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2447 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2448 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2449
2450 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2451 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2452 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2453
2454 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2455 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2456 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2457
2458 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2459 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2460 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2461
2462 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2463 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2464 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2465
2466 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2467 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2468 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2469
2470 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2471 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2472 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2473
2474 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2475 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2476 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2477
2478 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2479 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2480 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2481
2482 #undef DO_CMP_PPZW_B
2483 #undef DO_CMP_PPZW_H
2484 #undef DO_CMP_PPZW_S
2485 #undef DO_CMP_PPZW
2486
2487 /* Similar, but the second source is immediate. */
2488 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2489 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2490 { \
2491 intptr_t opr_sz = simd_oprsz(desc); \
2492 uint32_t flags = PREDTEST_INIT; \
2493 TYPE mm = simd_data(desc); \
2494 intptr_t i = opr_sz; \
2495 do { \
2496 uint64_t out = 0, pg; \
2497 do { \
2498 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2499 TYPE nn = *(TYPE *)(vn + H(i)); \
2500 out |= nn OP mm; \
2501 } while (i & 63); \
2502 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2503 out &= pg; \
2504 *(uint64_t *)(vd + (i >> 3)) = out; \
2505 flags = iter_predtest_bwd(out, pg, flags); \
2506 } while (i > 0); \
2507 return flags; \
2508 }
2509
2510 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2511 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2512 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2513 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2514 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2515 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2516 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2517 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2518
2519 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2520 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2521 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2522 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2523
2524 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2525 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2526 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2527 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2528
2529 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2530 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2531 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2532 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2533
2534 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2535 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2536 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2537 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2538
2539 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2540 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2541 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2542 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2543
2544 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2545 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2546 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2547 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2548
2549 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2550 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2551 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2552 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2553
2554 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2555 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2556 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2557 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2558
2559 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2560 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2561 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2562 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2563
2564 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2565 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2566 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2567 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2568
2569 #undef DO_CMP_PPZI_B
2570 #undef DO_CMP_PPZI_H
2571 #undef DO_CMP_PPZI_S
2572 #undef DO_CMP_PPZI_D
2573 #undef DO_CMP_PPZI
2574
2575 /* Similar to the ARM LastActive pseudocode function. */
2576 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2577 {
2578 intptr_t i;
2579
2580 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2581 uint64_t pg = *(uint64_t *)(vg + i);
2582 if (pg) {
2583 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2584 }
2585 }
2586 return 0;
2587 }
2588
2589 /* Compute a mask into RETB that is true for all G, up to and including
2590 * (if after) or excluding (if !after) the first G & N.
2591 * Return true if BRK found.
2592 */
2593 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2594 bool brk, bool after)
2595 {
2596 uint64_t b;
2597
2598 if (brk) {
2599 b = 0;
2600 } else if ((g & n) == 0) {
2601 /* For all G, no N are set; break not found. */
2602 b = g;
2603 } else {
2604 /* Break somewhere in N. Locate it. */
2605 b = g & n; /* guard true, pred true */
2606 b = b & -b; /* first such */
2607 if (after) {
2608 b = b | (b - 1); /* break after same */
2609 } else {
2610 b = b - 1; /* break before same */
2611 }
2612 brk = true;
2613 }
2614
2615 *retb = b;
2616 return brk;
2617 }
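
/* For example, with g == 0xff, n == 0x10 and no break pending:
 * b & -b isolates bit 4, so *retb becomes 0x1f when AFTER is true
 * (break after the first true element) or 0x0f when it is false
 * (break before it), and true is returned so that all later words
 * are forced to zero.
 */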
2618
2619 /* Compute a zeroing BRK. */
2620 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2621 intptr_t oprsz, bool after)
2622 {
2623 bool brk = false;
2624 intptr_t i;
2625
2626 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2627 uint64_t this_b, this_g = g[i];
2628
2629 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2630 d[i] = this_b & this_g;
2631 }
2632 }
2633
2634 /* Likewise, but also compute flags. */
2635 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2636 intptr_t oprsz, bool after)
2637 {
2638 uint32_t flags = PREDTEST_INIT;
2639 bool brk = false;
2640 intptr_t i;
2641
2642 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2643 uint64_t this_b, this_d, this_g = g[i];
2644
2645 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2646 d[i] = this_d = this_b & this_g;
2647 flags = iter_predtest_fwd(this_d, this_g, flags);
2648 }
2649 return flags;
2650 }
2651
2652 /* Compute a merging BRK. */
2653 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2654 intptr_t oprsz, bool after)
2655 {
2656 bool brk = false;
2657 intptr_t i;
2658
2659 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2660 uint64_t this_b, this_g = g[i];
2661
2662 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2663 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2664 }
2665 }
2666
2667 /* Likewise, but also compute flags. */
2668 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2669 intptr_t oprsz, bool after)
2670 {
2671 uint32_t flags = PREDTEST_INIT;
2672 bool brk = false;
2673 intptr_t i;
2674
2675     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2676 uint64_t this_b, this_d = d[i], this_g = g[i];
2677
2678 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2679 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2680 flags = iter_predtest_fwd(this_d, this_g, flags);
2681 }
2682 return flags;
2683 }
2684
2685 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2686 {
2687 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2688 * The compiler should turn this into 4 64-bit integer stores.
2689 */
2690 memset(d, 0, sizeof(ARMPredicateReg));
2691 return PREDTEST_INIT;
2692 }
2693
2694 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2695 uint32_t pred_desc)
2696 {
2697 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2698 if (last_active_pred(vn, vg, oprsz)) {
2699 compute_brk_z(vd, vm, vg, oprsz, true);
2700 } else {
2701 do_zero(vd, oprsz);
2702 }
2703 }
2704
2705 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2706 uint32_t pred_desc)
2707 {
2708 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2709 if (last_active_pred(vn, vg, oprsz)) {
2710 return compute_brks_z(vd, vm, vg, oprsz, true);
2711 } else {
2712 return do_zero(vd, oprsz);
2713 }
2714 }
2715
2716 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2717 uint32_t pred_desc)
2718 {
2719 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2720 if (last_active_pred(vn, vg, oprsz)) {
2721 compute_brk_z(vd, vm, vg, oprsz, false);
2722 } else {
2723 do_zero(vd, oprsz);
2724 }
2725 }
2726
2727 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2728 uint32_t pred_desc)
2729 {
2730 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2731 if (last_active_pred(vn, vg, oprsz)) {
2732 return compute_brks_z(vd, vm, vg, oprsz, false);
2733 } else {
2734 return do_zero(vd, oprsz);
2735 }
2736 }
2737
2738 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2739 {
2740 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2741 compute_brk_z(vd, vn, vg, oprsz, true);
2742 }
2743
2744 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2745 {
2746 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747 return compute_brks_z(vd, vn, vg, oprsz, true);
2748 }
2749
2750 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2751 {
2752 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2753 compute_brk_z(vd, vn, vg, oprsz, false);
2754 }
2755
2756 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2757 {
2758 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2759 return compute_brks_z(vd, vn, vg, oprsz, false);
2760 }
2761
2762 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2763 {
2764 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2765 compute_brk_m(vd, vn, vg, oprsz, true);
2766 }
2767
2768 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2769 {
2770 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771 return compute_brks_m(vd, vn, vg, oprsz, true);
2772 }
2773
2774 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2775 {
2776 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2777 compute_brk_m(vd, vn, vg, oprsz, false);
2778 }
2779
2780 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2781 {
2782 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2783 return compute_brks_m(vd, vn, vg, oprsz, false);
2784 }
2785
2786 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2787 {
2788 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2789
2790 if (!last_active_pred(vn, vg, oprsz)) {
2791 do_zero(vd, oprsz);
2792 }
2793 }
2794
2795 /* As if PredTest(Ones(PL), D, esz). */
2796 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2797 uint64_t esz_mask)
2798 {
2799 uint32_t flags = PREDTEST_INIT;
2800 intptr_t i;
2801
2802 for (i = 0; i < oprsz / 8; i++) {
2803 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2804 }
2805 if (oprsz & 7) {
2806 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2807 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2808 }
2809 return flags;
2810 }
2811
2812 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2813 {
2814 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2815
2816 if (last_active_pred(vn, vg, oprsz)) {
2817 return predtest_ones(vd, oprsz, -1);
2818 } else {
2819 return do_zero(vd, oprsz);
2820 }
2821 }
2822
2823 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2824 {
2825 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2826 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2827 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2828 intptr_t i;
2829
2830 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2831 uint64_t t = n[i] & g[i] & mask;
2832 sum += ctpop64(t);
2833 }
2834 return sum;
2835 }
2836
2837 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2838 {
2839 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2840 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2841 uint64_t esz_mask = pred_esz_masks[esz];
2842 ARMPredicateReg *d = vd;
2843 uint32_t flags;
2844 intptr_t i;
2845
2846 /* Begin with a zero predicate register. */
2847 flags = do_zero(d, oprsz);
2848 if (count == 0) {
2849 return flags;
2850 }
2851
2852 /* Set all of the requested bits. */
2853 for (i = 0; i < count / 64; ++i) {
2854 d->p[i] = esz_mask;
2855 }
2856 if (count & 63) {
2857 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2858 }
2859
2860 return predtest_ones(d, oprsz, esz_mask);
2861 }
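
/* For example, COUNT == 10 with ESZ == 1 sets the low ten bits and
 * masks them with the halfword element mask (every other bit, as for
 * the _h compares above), leaving predicate bits 0, 2, 4, 6 and 8
 * set, i.e. the first five halfword elements active.  predtest_ones
 * then derives the NZCV result from that pattern.
 */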
2862
2863 /* Recursive tree reduction using FUNC;
2864  * cf. the ARM ARM pseudocode function ReducePredicated.
2865 *
2866 * While it would be possible to write this without the DATA temporary,
2867 * it is much simpler to process the predicate register this way.
2868 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2869 * little to gain with a more complex non-recursive form.
2870 */
2871 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2872 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2873 { \
2874 if (n == 1) { \
2875 return *data; \
2876 } else { \
2877 uintptr_t half = n / 2; \
2878 TYPE lo = NAME##_reduce(data, status, half); \
2879 TYPE hi = NAME##_reduce(data + half, status, half); \
2880 return TYPE##_##FUNC(lo, hi, status); \
2881 } \
2882 } \
2883 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2884 { \
2885 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2886 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2887 for (i = 0; i < oprsz; ) { \
2888 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2889 do { \
2890 TYPE nn = *(TYPE *)(vn + H(i)); \
2891 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2892 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2893 } while (i & 15); \
2894 } \
2895 for (; i < maxsz; i += sizeof(TYPE)) { \
2896 *(TYPE *)((void *)data + i) = IDENT; \
2897 } \
2898 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2899 }
2900
2901 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2902 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2903 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2904
2905 /* Identity is floatN_default_nan, without the function call. */
2906 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2907 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2908 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2909
2910 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2911 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2912 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2913
2914 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2915 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2916 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2917
2918 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2919 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2920 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2921
2922 #undef DO_REDUCE
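
/* The reduction above is a balanced binary tree: for four elements
 * the result is FUNC(FUNC(data[0], data[1]), FUNC(data[2], data[3])),
 * with the status argument threaded through each call.  Inactive and
 * trailing lanes are preset to IDENT, so the shape of the tree does
 * not depend on the predicate.
 */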
2923
2924 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2925 void *status, uint32_t desc)
2926 {
2927 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2928 float16 result = nn;
2929
2930 do {
2931 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2932 do {
2933 if (pg & 1) {
2934 float16 mm = *(float16 *)(vm + H1_2(i));
2935 result = float16_add(result, mm, status);
2936 }
2937 i += sizeof(float16), pg >>= sizeof(float16);
2938 } while (i & 15);
2939 } while (i < opr_sz);
2940
2941 return result;
2942 }
2943
2944 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2945 void *status, uint32_t desc)
2946 {
2947 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2948 float32 result = nn;
2949
2950 do {
2951 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2952 do {
2953 if (pg & 1) {
2954 float32 mm = *(float32 *)(vm + H1_2(i));
2955 result = float32_add(result, mm, status);
2956 }
2957 i += sizeof(float32), pg >>= sizeof(float32);
2958 } while (i & 15);
2959 } while (i < opr_sz);
2960
2961 return result;
2962 }
2963
2964 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2965 void *status, uint32_t desc)
2966 {
2967 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2968 uint64_t *m = vm;
2969 uint8_t *pg = vg;
2970
2971 for (i = 0; i < opr_sz; i++) {
2972 if (pg[H1(i)] & 1) {
2973 nn = float64_add(nn, m[i], status);
2974 }
2975 }
2976
2977 return nn;
2978 }
2979
2980 /* Fully general three-operand expander, controlled by a predicate,
2981  * with the extra float_status parameter.
2982 */
2983 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2984 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2985 void *status, uint32_t desc) \
2986 { \
2987 intptr_t i = simd_oprsz(desc); \
2988 uint64_t *g = vg; \
2989 do { \
2990 uint64_t pg = g[(i - 1) >> 6]; \
2991 do { \
2992 i -= sizeof(TYPE); \
2993 if (likely((pg >> (i & 63)) & 1)) { \
2994 TYPE nn = *(TYPE *)(vn + H(i)); \
2995 TYPE mm = *(TYPE *)(vm + H(i)); \
2996 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2997 } \
2998 } while (i & 63); \
2999 } while (i != 0); \
3000 }
3001
3002 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3003 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3004 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3005
3006 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3007 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3008 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3009
3010 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3011 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3012 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3013
3014 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3015 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)