target/arm: Implement SVE Memory Contiguous Store Group
[qemu.git] target/arm/sve_helper.c
1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27
28
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30    so addressing units smaller than that need a host-endian fixup.  */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
44
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
46 *
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
50 */
51
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
54
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
57 */
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
59 {
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
66 }
67
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
70
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
73 }
74 return flags;
75 }
76
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
79 */
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
81 {
82 if (likely(g)) {
83         /* Compute C from first (i.e. last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
88 }
89
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
92
93         /* Compute N from last (i.e. first) D & G.  Replace previous.  */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
95 }
96 return flags;
97 }
98
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
101 {
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
103 }
104
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
107 {
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
111
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
115
116 return flags;
117 }
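/* Worked example, for illustration: a single predicate word with
 * g = 0x11 (two active element slots) and d = 0x01 (only the first
 * one true) gives N = 1 (first active bit of d set), Z clear (some
 * active bit set, encoded as bit 1) and C = 1 (last active bit of d
 * clear), so the return value is 0x80000007; bit 2 is only the
 * internal "first G bit seen" marker from the iteration above.
 */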
118
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
125 * }
126 * }
127 * printf("0x%016lx,\n", m);
128 * }
129 */
130 static inline uint64_t expand_pred_b(uint8_t byte)
131 {
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
219 };
220 return word[byte];
221 }
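/* For example, expand_pred_b(0x31) is 0x0000ffff000000ff: predicate
 * bits 0, 4 and 5 become all-ones masks for byte elements 0, 4 and 5.
 */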
222
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
228 * }
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
232 * }
233 * }
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
235 * }
236 */
237 static inline uint64_t expand_pred_h(uint8_t byte)
238 {
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
248 };
249 return word[byte & 0x55];
250 }
251
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
254 {
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
259 };
260 return word[byte & 0x11];
261 }
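/* E.g. expand_pred_h(0x05) is 0x00000000ffffffff (the two low 16-bit
 * elements active) and expand_pred_s(0x10) is 0xffffffff00000000.
 */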
262
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
265 {
266 return rol32(h, 16);
267 }
268
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
271 {
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
275 }
276
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
279 {
280 return rol64(h, 32);
281 }
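/* E.g. hswap64(0x0011223344556677) is 0x6677445522330011, i.e. the
 * four 16-bit lanes in reverse order; wswap64 likewise reverses the
 * two 32-bit halves.
 */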
282
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
285 { \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
291 } \
292 }
293
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
302
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
311
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
321
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
324 */
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
329 */
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
332 { \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
341 } \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
344 } \
345 }
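/* The expander above consumes the predicate 16 vector bytes at a
 * time: the 16-bit pg chunk has one bit per vector byte, and for
 * wider elements only the lowest bit of each element's group is
 * tested (pg & 1, then pg >>= sizeof(TYPE)), matching the
 * architectural rule that only the least significant bit of each
 * predicate element is significant.
 */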
346
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
350 { \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
358 } \
359 } \
360 }
361
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
372 #define DO_DIV(N, M) (M ? N / M : 0)
373
374 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
375 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
376 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
377 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
378
379 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
380 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
381 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
382 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
383
384 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
385 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
386 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
387 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
388
389 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
390 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
391 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
392 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
393
394 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
395 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
396 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
397 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
398
399 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
400 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
401 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
402 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
403
404 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
405 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
406 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
407 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
408
409 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
410 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
411 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
412 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
413
414 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
415 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
416 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
417 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
418
419 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
420 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
421 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
422 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
423
424 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
425 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
426 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
427 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
428
429 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
430 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
431 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
432 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
433
434 /* Because the computation type is at least twice as large as required,
435 these work for both signed and unsigned source types. */
436 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
437 {
438 return (n * m) >> 8;
439 }
440
441 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
442 {
443 return (n * m) >> 16;
444 }
445
446 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
447 {
448 return (n * m) >> 32;
449 }
450
451 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
452 {
453 uint64_t lo, hi;
454 muls64(&lo, &hi, n, m);
455 return hi;
456 }
457
458 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
459 {
460 uint64_t lo, hi;
461 mulu64(&lo, &hi, n, m);
462 return hi;
463 }
464
465 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
466 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
467 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
468 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
469
470 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
471 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
472 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
473 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
474
475 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
476 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
477 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
478 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
479
480 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
481 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
482
483 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
484 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
485
486 /* Note that all bits of the shift are significant
487 and not modulo the element size. */
488 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
489 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
490 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
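/* E.g. for 8-bit elements, DO_LSR(n, 8) and DO_LSL(n, 8) are 0,
 * while DO_ASR(n, 8) clamps the shift to 7 and therefore fills the
 * result with the sign bit, as the architecture requires for
 * out-of-range shifts.
 */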
491
492 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
493 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
494 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
495
496 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
497 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
498 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
499
500 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
501 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
502 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
503
504 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
505 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
506 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
507
508 #undef DO_ZPZZ
509 #undef DO_ZPZZ_D
510
511 /* Three-operand expander, controlled by a predicate, in which the
512 * third operand is "wide". That is, for D = N op M, the same 64-bit
513 * value of M is used with all of the narrower values of N.
514 */
515 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
516 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
517 { \
518 intptr_t i, opr_sz = simd_oprsz(desc); \
519 for (i = 0; i < opr_sz; ) { \
520 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
521 TYPEW mm = *(TYPEW *)(vm + i); \
522 do { \
523 if (pg & 1) { \
524 TYPE nn = *(TYPE *)(vn + H(i)); \
525 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
526 } \
527 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
528 } while (i & 7); \
529 } \
530 }
531
532 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
533 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
534 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
535
536 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
537 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
538 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
539
540 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
541 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
542 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
543
544 #undef DO_ZPZW
545
546 /* Fully general two-operand expander, controlled by a predicate.
547 */
548 #define DO_ZPZ(NAME, TYPE, H, OP) \
549 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
550 { \
551 intptr_t i, opr_sz = simd_oprsz(desc); \
552 for (i = 0; i < opr_sz; ) { \
553 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
554 do { \
555 if (pg & 1) { \
556 TYPE nn = *(TYPE *)(vn + H(i)); \
557 *(TYPE *)(vd + H(i)) = OP(nn); \
558 } \
559 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
560 } while (i & 15); \
561 } \
562 }
563
564 /* Similarly, specialized for 64-bit operands. */
565 #define DO_ZPZ_D(NAME, TYPE, OP) \
566 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
567 { \
568 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
569 TYPE *d = vd, *n = vn; \
570 uint8_t *pg = vg; \
571 for (i = 0; i < opr_sz; i += 1) { \
572 if (pg[H1(i)] & 1) { \
573 TYPE nn = n[i]; \
574 d[i] = OP(nn); \
575 } \
576 } \
577 }
578
579 #define DO_CLS_B(N) (clrsb32(N) - 24)
580 #define DO_CLS_H(N) (clrsb32(N) - 16)
581
582 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
583 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
584 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
585 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
586
587 #define DO_CLZ_B(N) (clz32(N) - 24)
588 #define DO_CLZ_H(N) (clz32(N) - 16)
589
590 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
591 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
592 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
593 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
594
595 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
596 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
597 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
598 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
599
600 #define DO_CNOT(N) (N == 0)
601
602 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
603 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
604 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
605 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
606
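/* DO_FABS clears the floating-point sign bit and DO_FNEG (below)
 * toggles it, treating the value purely as a bit pattern:
 * (__typeof(N))-1 >> 1 is a mask of all bits except the msb.
 */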
607 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
608
609 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
610 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
611 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
612
613 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
614
615 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
616 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
617 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
618
619 #define DO_NOT(N) (~N)
620
621 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
622 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
623 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
624 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
625
626 #define DO_SXTB(N) ((int8_t)N)
627 #define DO_SXTH(N) ((int16_t)N)
628 #define DO_SXTS(N) ((int32_t)N)
629 #define DO_UXTB(N) ((uint8_t)N)
630 #define DO_UXTH(N) ((uint16_t)N)
631 #define DO_UXTS(N) ((uint32_t)N)
632
633 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
634 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
635 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
636 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
637 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
638 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
639
640 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
641 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
642 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
643 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
644 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
645 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
646
647 #define DO_ABS(N) (N < 0 ? -N : N)
648
649 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
650 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
651 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
652 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
653
654 #define DO_NEG(N) (-N)
655
656 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
657 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
658 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
659 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
660
661 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
662 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
663 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
664
665 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
666 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
667
668 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
669
670 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
671 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
672 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
673 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
674
675 /* Three-operand expander, unpredicated, in which the third operand is "wide".
676 */
677 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
678 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
679 { \
680 intptr_t i, opr_sz = simd_oprsz(desc); \
681 for (i = 0; i < opr_sz; ) { \
682 TYPEW mm = *(TYPEW *)(vm + i); \
683 do { \
684 TYPE nn = *(TYPE *)(vn + H(i)); \
685 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
686 i += sizeof(TYPE); \
687 } while (i & 7); \
688 } \
689 }
690
691 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
692 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
693 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
694
695 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
696 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
697 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
698
699 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
700 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
701 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
702
703 #undef DO_ZZW
704
705 #undef DO_CLS_B
706 #undef DO_CLS_H
707 #undef DO_CLZ_B
708 #undef DO_CLZ_H
709 #undef DO_CNOT
710 #undef DO_FABS
711 #undef DO_FNEG
712 #undef DO_ABS
713 #undef DO_NEG
714 #undef DO_ZPZ
715 #undef DO_ZPZ_D
716
717 /* Two-operand reduction expander, controlled by a predicate.
718 * The difference between TYPERED and TYPERET has to do with
719 * sign-extension. E.g. for SMAX, TYPERED must be signed,
720 * but TYPERET must be unsigned so that e.g. a 32-bit value
721 * is not sign-extended to the ABI uint64_t return type.
722 */
723 /* ??? If we were to vectorize this by hand the reduction ordering
724 * would change. For integer operands, this is perfectly fine.
725 */
726 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
727 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
728 { \
729 intptr_t i, opr_sz = simd_oprsz(desc); \
730 TYPERED ret = INIT; \
731 for (i = 0; i < opr_sz; ) { \
732 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
733 do { \
734 if (pg & 1) { \
735 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
736 ret = OP(ret, nn); \
737 } \
738 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
739 } while (i & 15); \
740 } \
741 return (TYPERET)ret; \
742 }
743
744 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
745 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
746 { \
747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
748 TYPEE *n = vn; \
749 uint8_t *pg = vg; \
750 TYPER ret = INIT; \
751 for (i = 0; i < opr_sz; i += 1) { \
752 if (pg[H1(i)] & 1) { \
753 TYPEE nn = n[i]; \
754 ret = OP(ret, nn); \
755 } \
756 } \
757 return ret; \
758 }
759
760 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
761 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
762 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
763 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
764
765 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
766 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
767 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
768 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
769
770 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
771 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
772 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
773 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
774
775 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
776 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
777 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
778
779 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
780 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
781 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
782 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
783
784 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
785 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
786 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
787 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
788
789 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
790 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
791 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
792 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
793
794 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
795 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
796 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
797 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
798
799 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
800 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
801 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
802 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
803
804 #undef DO_VPZ
805 #undef DO_VPZ_D
806
807 /* Two vector operands, one scalar operand, unpredicated.  */
808 #define DO_ZZI(NAME, TYPE, OP) \
809 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
810 { \
811 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
812 TYPE s = s64, *d = vd, *n = vn; \
813 for (i = 0; i < opr_sz; ++i) { \
814 d[i] = OP(n[i], s); \
815 } \
816 }
817
818 #define DO_SUBR(X, Y) (Y - X)
819
820 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
821 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
822 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
823 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
824
825 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
826 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
827 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
828 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
829
830 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
831 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
832 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
833 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
834
835 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
836 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
837 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
838 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
839
840 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
841 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
842 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
843 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
844
845 #undef DO_ZZI
846
847 #undef DO_AND
848 #undef DO_ORR
849 #undef DO_EOR
850 #undef DO_BIC
851 #undef DO_ADD
852 #undef DO_SUB
853 #undef DO_MAX
854 #undef DO_MIN
855 #undef DO_ABD
856 #undef DO_MUL
857 #undef DO_DIV
858 #undef DO_ASR
859 #undef DO_LSR
860 #undef DO_LSL
861 #undef DO_SUBR
862
863 /* Similar to the ARM LastActiveElement pseudocode function, except the
864 result is multiplied by the element size. This includes the not found
865 indication; e.g. not found for esz=3 is -8. */
866 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
867 {
868 uint64_t mask = pred_esz_masks[esz];
869 intptr_t i = words;
870
871 do {
872 uint64_t this_g = g[--i] & mask;
873 if (this_g) {
874 return i * 64 + (63 - clz64(this_g));
875 }
876 } while (i > 0);
877 return (intptr_t)-1 << esz;
878 }
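/* E.g. with word elements (esz = 2) and a single predicate word of
 * 0x11, the last active element is element 1, so the result is
 * 1 * 4 = 4; with no active elements the result is -1 << 2 = -4.
 */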
879
880 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
881 {
882 uint32_t flags = PREDTEST_INIT;
883 uint64_t *d = vd, *g = vg;
884 intptr_t i = 0;
885
886 do {
887 uint64_t this_d = d[i];
888 uint64_t this_g = g[i];
889
890 if (this_g) {
891 if (!(flags & 4)) {
892 /* Set in D the first bit of G. */
893 this_d |= this_g & -this_g;
894 d[i] = this_d;
895 }
896 flags = iter_predtest_fwd(this_d, this_g, flags);
897 }
898 } while (++i < words);
899
900 return flags;
901 }
902
903 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
904 {
905 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
906 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
907 uint32_t flags = PREDTEST_INIT;
908 uint64_t *d = vd, *g = vg, esz_mask;
909 intptr_t i, next;
910
911 next = last_active_element(vd, words, esz) + (1 << esz);
912 esz_mask = pred_esz_masks[esz];
913
914 /* Similar to the pseudocode for pnext, but scaled by ESZ
915 so that we find the correct bit. */
916 if (next < words * 64) {
917 uint64_t mask = -1;
918
919 if (next & 63) {
920 mask = ~((1ull << (next & 63)) - 1);
921 next &= -64;
922 }
923 do {
924 uint64_t this_g = g[next / 64] & esz_mask & mask;
925 if (this_g != 0) {
926 next = (next & -64) + ctz64(this_g);
927 break;
928 }
929 next += 64;
930 mask = -1;
931 } while (next < words * 64);
932 }
933
934 i = 0;
935 do {
936 uint64_t this_d = 0;
937 if (i == next / 64) {
938 this_d = 1ull << (next & 63);
939 }
940 d[i] = this_d;
941 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
942 } while (++i < words);
943
944 return flags;
945 }
946
947 /* Store zero into every active element of Zd. We will use this for two
948 * and three-operand predicated instructions for which logic dictates a
949 * zero result. In particular, logical shift by element size, which is
950 * otherwise undefined on the host.
951 *
952 * For element sizes smaller than uint64_t, we use tables to expand
953 * the N bits of the controlling predicate to a byte mask, and clear
954 * those bytes.
955 */
956 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
957 {
958 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
959 uint64_t *d = vd;
960 uint8_t *pg = vg;
961 for (i = 0; i < opr_sz; i += 1) {
962 d[i] &= ~expand_pred_b(pg[H1(i)]);
963 }
964 }
965
966 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
967 {
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_h(pg[H1(i)]);
973 }
974 }
975
976 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
977 {
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_s(pg[H1(i)]);
983 }
984 }
985
986 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
987 {
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 if (pg[H1(i)] & 1) {
993 d[i] = 0;
994 }
995 }
996 }
997
998 /* Three-operand expander, immediate operand, controlled by a predicate.
999 */
1000 #define DO_ZPZI(NAME, TYPE, H, OP) \
1001 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1002 { \
1003 intptr_t i, opr_sz = simd_oprsz(desc); \
1004 TYPE imm = simd_data(desc); \
1005 for (i = 0; i < opr_sz; ) { \
1006 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1007 do { \
1008 if (pg & 1) { \
1009 TYPE nn = *(TYPE *)(vn + H(i)); \
1010 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1011 } \
1012 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1013 } while (i & 15); \
1014 } \
1015 }
1016
1017 /* Similarly, specialized for 64-bit operands. */
1018 #define DO_ZPZI_D(NAME, TYPE, OP) \
1019 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1020 { \
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1022 TYPE *d = vd, *n = vn; \
1023 TYPE imm = simd_data(desc); \
1024 uint8_t *pg = vg; \
1025 for (i = 0; i < opr_sz; i += 1) { \
1026 if (pg[H1(i)] & 1) { \
1027 TYPE nn = n[i]; \
1028 d[i] = OP(nn, imm); \
1029 } \
1030 } \
1031 }
1032
1033 #define DO_SHR(N, M) (N >> M)
1034 #define DO_SHL(N, M) (N << M)
1035
1036 /* Arithmetic shift right for division. This rounds negative numbers
1037 toward zero as per signed division. Therefore before shifting,
1038 when N is negative, add 2**M-1. */
1039 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
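/* E.g. DO_ASRD(-5, 2) computes (-5 + 3) >> 2 = -1, matching -5 / 4
 * truncated toward zero, where a plain arithmetic shift gives -2.
 */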
1040
1041 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1042 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1043 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1044 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1045
1046 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1047 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1048 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1049 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1050
1051 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1052 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1053 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1054 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1055
1056 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1057 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1058 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1059 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1060
1061 #undef DO_SHR
1062 #undef DO_SHL
1063 #undef DO_ASRD
1064 #undef DO_ZPZI
1065 #undef DO_ZPZI_D
1066
1067 /* Fully general four-operand expander, controlled by a predicate.
1068 */
1069 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1070 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1071 void *vg, uint32_t desc) \
1072 { \
1073 intptr_t i, opr_sz = simd_oprsz(desc); \
1074 for (i = 0; i < opr_sz; ) { \
1075 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1076 do { \
1077 if (pg & 1) { \
1078 TYPE nn = *(TYPE *)(vn + H(i)); \
1079 TYPE mm = *(TYPE *)(vm + H(i)); \
1080 TYPE aa = *(TYPE *)(va + H(i)); \
1081 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1082 } \
1083 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1084 } while (i & 15); \
1085 } \
1086 }
1087
1088 /* Similarly, specialized for 64-bit operands. */
1089 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1090 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1091 void *vg, uint32_t desc) \
1092 { \
1093 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1094 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1095 uint8_t *pg = vg; \
1096 for (i = 0; i < opr_sz; i += 1) { \
1097 if (pg[H1(i)] & 1) { \
1098 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1099 d[i] = OP(aa, nn, mm); \
1100 } \
1101 } \
1102 }
1103
1104 #define DO_MLA(A, N, M) (A + N * M)
1105 #define DO_MLS(A, N, M) (A - N * M)
1106
1107 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1108 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1109
1110 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1111 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1112
1113 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1114 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1115
1116 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1117 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1118
1119 #undef DO_MLA
1120 #undef DO_MLS
1121 #undef DO_ZPZZZ
1122 #undef DO_ZPZZZ_D
1123
1124 void HELPER(sve_index_b)(void *vd, uint32_t start,
1125 uint32_t incr, uint32_t desc)
1126 {
1127 intptr_t i, opr_sz = simd_oprsz(desc);
1128 uint8_t *d = vd;
1129 for (i = 0; i < opr_sz; i += 1) {
1130 d[H1(i)] = start + i * incr;
1131 }
1132 }
1133
1134 void HELPER(sve_index_h)(void *vd, uint32_t start,
1135 uint32_t incr, uint32_t desc)
1136 {
1137 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1138 uint16_t *d = vd;
1139 for (i = 0; i < opr_sz; i += 1) {
1140 d[H2(i)] = start + i * incr;
1141 }
1142 }
1143
1144 void HELPER(sve_index_s)(void *vd, uint32_t start,
1145 uint32_t incr, uint32_t desc)
1146 {
1147 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1148 uint32_t *d = vd;
1149 for (i = 0; i < opr_sz; i += 1) {
1150 d[H4(i)] = start + i * incr;
1151 }
1152 }
1153
1154 void HELPER(sve_index_d)(void *vd, uint64_t start,
1155 uint64_t incr, uint32_t desc)
1156 {
1157 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1158 uint64_t *d = vd;
1159 for (i = 0; i < opr_sz; i += 1) {
1160 d[i] = start + i * incr;
1161 }
1162 }
1163
1164 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1165 {
1166 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1167 uint32_t sh = simd_data(desc);
1168 uint32_t *d = vd, *n = vn, *m = vm;
1169 for (i = 0; i < opr_sz; i += 1) {
1170 d[i] = n[i] + (m[i] << sh);
1171 }
1172 }
1173
1174 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1175 {
1176 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1177 uint64_t sh = simd_data(desc);
1178 uint64_t *d = vd, *n = vn, *m = vm;
1179 for (i = 0; i < opr_sz; i += 1) {
1180 d[i] = n[i] + (m[i] << sh);
1181 }
1182 }
1183
1184 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1185 {
1186 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1187 uint64_t sh = simd_data(desc);
1188 uint64_t *d = vd, *n = vn, *m = vm;
1189 for (i = 0; i < opr_sz; i += 1) {
1190 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1191 }
1192 }
1193
1194 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1195 {
1196 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1197 uint64_t sh = simd_data(desc);
1198 uint64_t *d = vd, *n = vn, *m = vm;
1199 for (i = 0; i < opr_sz; i += 1) {
1200 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1201 }
1202 }
1203
1204 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1205 {
1206 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1207 static const uint16_t coeff[] = {
1208 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1209 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1210 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1211 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1212 };
1213 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1214 uint16_t *d = vd, *n = vn;
1215
1216 for (i = 0; i < opr_sz; i++) {
1217 uint16_t nn = n[i];
1218 intptr_t idx = extract32(nn, 0, 5);
1219 uint16_t exp = extract32(nn, 5, 5);
1220 d[i] = coeff[idx] | (exp << 10);
1221 }
1222 }
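/* E.g. an input of 0x0023 has index 3 and exponent field 1, so the
 * result is coeff[3] | (1 << 10) = 0x0045 | 0x0400 = 0x0445.
 */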
1223
1224 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1225 {
1226 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1227 static const uint32_t coeff[] = {
1228 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1229 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1230 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1231 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1232 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1233 0x1ef532, 0x20b051, 0x227043, 0x243516,
1234 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1235 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1236 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1237 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1238 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1239 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1240 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1241 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1242 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1243 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1244 };
1245 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1246 uint32_t *d = vd, *n = vn;
1247
1248 for (i = 0; i < opr_sz; i++) {
1249 uint32_t nn = n[i];
1250 intptr_t idx = extract32(nn, 0, 6);
1251 uint32_t exp = extract32(nn, 6, 8);
1252 d[i] = coeff[idx] | (exp << 23);
1253 }
1254 }
1255
1256 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1257 {
1258 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1259 static const uint64_t coeff[] = {
1260 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1261 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1262 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1263 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1264 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1265 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1266 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1267 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1268 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1269 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1270 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1271 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1272 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1273 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1274 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1275 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1276 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1277 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1278 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1279 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1280 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1281 0xFA7C1819E90D8ull,
1282 };
1283 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1284 uint64_t *d = vd, *n = vn;
1285
1286 for (i = 0; i < opr_sz; i++) {
1287 uint64_t nn = n[i];
1288 intptr_t idx = extract32(nn, 0, 6);
1289 uint64_t exp = extract32(nn, 6, 11);
1290 d[i] = coeff[idx] | (exp << 52);
1291 }
1292 }
1293
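/* FTSSEL: bit 0 of the second operand selects 1.0 in place of the
 * first operand, and bit 1 negates the result by toggling the
 * floating-point sign bit.
 */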
1294 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1295 {
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1297 uint16_t *d = vd, *n = vn, *m = vm;
1298 for (i = 0; i < opr_sz; i += 1) {
1299 uint16_t nn = n[i];
1300 uint16_t mm = m[i];
1301 if (mm & 1) {
1302 nn = float16_one;
1303 }
1304 d[i] = nn ^ (mm & 2) << 14;
1305 }
1306 }
1307
1308 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1309 {
1310 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1311 uint32_t *d = vd, *n = vn, *m = vm;
1312 for (i = 0; i < opr_sz; i += 1) {
1313 uint32_t nn = n[i];
1314 uint32_t mm = m[i];
1315 if (mm & 1) {
1316 nn = float32_one;
1317 }
1318 d[i] = nn ^ (mm & 2) << 30;
1319 }
1320 }
1321
1322 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1323 {
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1325 uint64_t *d = vd, *n = vn, *m = vm;
1326 for (i = 0; i < opr_sz; i += 1) {
1327 uint64_t nn = n[i];
1328 uint64_t mm = m[i];
1329 if (mm & 1) {
1330 nn = float64_one;
1331 }
1332 d[i] = nn ^ (mm & 2) << 62;
1333 }
1334 }
1335
1336 /*
1337 * Signed saturating addition with scalar operand.
1338 */
1339
1340 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1341 {
1342 intptr_t i, oprsz = simd_oprsz(desc);
1343
1344 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1345 int r = *(int8_t *)(a + i) + b;
1346 if (r > INT8_MAX) {
1347 r = INT8_MAX;
1348 } else if (r < INT8_MIN) {
1349 r = INT8_MIN;
1350 }
1351 *(int8_t *)(d + i) = r;
1352 }
1353 }
1354
1355 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1356 {
1357 intptr_t i, oprsz = simd_oprsz(desc);
1358
1359 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1360 int r = *(int16_t *)(a + i) + b;
1361 if (r > INT16_MAX) {
1362 r = INT16_MAX;
1363 } else if (r < INT16_MIN) {
1364 r = INT16_MIN;
1365 }
1366 *(int16_t *)(d + i) = r;
1367 }
1368 }
1369
1370 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1371 {
1372 intptr_t i, oprsz = simd_oprsz(desc);
1373
1374 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1375 int64_t r = *(int32_t *)(a + i) + b;
1376 if (r > INT32_MAX) {
1377 r = INT32_MAX;
1378 } else if (r < INT32_MIN) {
1379 r = INT32_MIN;
1380 }
1381 *(int32_t *)(d + i) = r;
1382 }
1383 }
1384
1385 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1386 {
1387 intptr_t i, oprsz = simd_oprsz(desc);
1388
1389 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1390 int64_t ai = *(int64_t *)(a + i);
1391 int64_t r = ai + b;
1392 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1393 /* Signed overflow. */
1394 r = (r < 0 ? INT64_MAX : INT64_MIN);
1395 }
1396 *(int64_t *)(d + i) = r;
1397 }
1398 }
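/* The saturation test in sve_sqaddi_d above is the usual
 * two's-complement check: overflow is only possible when both addends
 * have the same sign, and it happened iff the sign of the result
 * differs from theirs, i.e. (r ^ ai) & ~(ai ^ b) is negative.
 */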
1399
1400 /*
1401 * Unsigned saturating addition with scalar operand.
1402 */
1403
1404 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1405 {
1406 intptr_t i, oprsz = simd_oprsz(desc);
1407
1408 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1409 int r = *(uint8_t *)(a + i) + b;
1410 if (r > UINT8_MAX) {
1411 r = UINT8_MAX;
1412 } else if (r < 0) {
1413 r = 0;
1414 }
1415 *(uint8_t *)(d + i) = r;
1416 }
1417 }
1418
1419 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1420 {
1421 intptr_t i, oprsz = simd_oprsz(desc);
1422
1423 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1424 int r = *(uint16_t *)(a + i) + b;
1425 if (r > UINT16_MAX) {
1426 r = UINT16_MAX;
1427 } else if (r < 0) {
1428 r = 0;
1429 }
1430 *(uint16_t *)(d + i) = r;
1431 }
1432 }
1433
1434 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1435 {
1436 intptr_t i, oprsz = simd_oprsz(desc);
1437
1438 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1439 int64_t r = *(uint32_t *)(a + i) + b;
1440 if (r > UINT32_MAX) {
1441 r = UINT32_MAX;
1442 } else if (r < 0) {
1443 r = 0;
1444 }
1445 *(uint32_t *)(d + i) = r;
1446 }
1447 }
1448
1449 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1450 {
1451 intptr_t i, oprsz = simd_oprsz(desc);
1452
1453 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1454 uint64_t r = *(uint64_t *)(a + i) + b;
1455 if (r < b) {
1456 r = UINT64_MAX;
1457 }
1458 *(uint64_t *)(d + i) = r;
1459 }
1460 }
1461
1462 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1463 {
1464 intptr_t i, oprsz = simd_oprsz(desc);
1465
1466 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1467 uint64_t ai = *(uint64_t *)(a + i);
1468 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1469 }
1470 }
1471
1472 /* Two operand predicated copy immediate with merge. All valid immediates
1473 * can fit within 17 signed bits in the simd_data field.
1474 */
1475 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1476 uint64_t mm, uint32_t desc)
1477 {
1478 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1479 uint64_t *d = vd, *n = vn;
1480 uint8_t *pg = vg;
1481
1482 mm = dup_const(MO_8, mm);
1483 for (i = 0; i < opr_sz; i += 1) {
1484 uint64_t nn = n[i];
1485 uint64_t pp = expand_pred_b(pg[H1(i)]);
1486 d[i] = (mm & pp) | (nn & ~pp);
1487 }
1488 }
1489
1490 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1491 uint64_t mm, uint32_t desc)
1492 {
1493 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1494 uint64_t *d = vd, *n = vn;
1495 uint8_t *pg = vg;
1496
1497 mm = dup_const(MO_16, mm);
1498 for (i = 0; i < opr_sz; i += 1) {
1499 uint64_t nn = n[i];
1500 uint64_t pp = expand_pred_h(pg[H1(i)]);
1501 d[i] = (mm & pp) | (nn & ~pp);
1502 }
1503 }
1504
1505 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1506 uint64_t mm, uint32_t desc)
1507 {
1508 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1509 uint64_t *d = vd, *n = vn;
1510 uint8_t *pg = vg;
1511
1512 mm = dup_const(MO_32, mm);
1513 for (i = 0; i < opr_sz; i += 1) {
1514 uint64_t nn = n[i];
1515 uint64_t pp = expand_pred_s(pg[H1(i)]);
1516 d[i] = (mm & pp) | (nn & ~pp);
1517 }
1518 }
1519
1520 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1521 uint64_t mm, uint32_t desc)
1522 {
1523 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1524 uint64_t *d = vd, *n = vn;
1525 uint8_t *pg = vg;
1526
1527 for (i = 0; i < opr_sz; i += 1) {
1528 uint64_t nn = n[i];
1529 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1530 }
1531 }
1532
1533 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1534 {
1535 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1536 uint64_t *d = vd;
1537 uint8_t *pg = vg;
1538
1539 val = dup_const(MO_8, val);
1540 for (i = 0; i < opr_sz; i += 1) {
1541 d[i] = val & expand_pred_b(pg[H1(i)]);
1542 }
1543 }
1544
1545 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1546 {
1547 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1548 uint64_t *d = vd;
1549 uint8_t *pg = vg;
1550
1551 val = dup_const(MO_16, val);
1552 for (i = 0; i < opr_sz; i += 1) {
1553 d[i] = val & expand_pred_h(pg[H1(i)]);
1554 }
1555 }
1556
1557 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1558 {
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd;
1561 uint8_t *pg = vg;
1562
1563 val = dup_const(MO_32, val);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 d[i] = val & expand_pred_s(pg[H1(i)]);
1566 }
1567 }
1568
1569 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1570 {
1571 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1572 uint64_t *d = vd;
1573 uint8_t *pg = vg;
1574
1575 for (i = 0; i < opr_sz; i += 1) {
1576 d[i] = (pg[H1(i)] & 1 ? val : 0);
1577 }
1578 }
1579
1580 /* Big-endian hosts need to frob the byte indices.  If the copy
1581 * happens to be 8-byte aligned, then no frobbing necessary.
1582 */
1583 static void swap_memmove(void *vd, void *vs, size_t n)
1584 {
1585 uintptr_t d = (uintptr_t)vd;
1586 uintptr_t s = (uintptr_t)vs;
1587 uintptr_t o = (d | s | n) & 7;
1588 size_t i;
1589
1590 #ifndef HOST_WORDS_BIGENDIAN
1591 o = 0;
1592 #endif
1593 switch (o) {
1594 case 0:
1595 memmove(vd, vs, n);
1596 break;
1597
1598 case 4:
1599 if (d < s || d >= s + n) {
1600 for (i = 0; i < n; i += 4) {
1601 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1602 }
1603 } else {
1604 for (i = n; i > 0; ) {
1605 i -= 4;
1606 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1607 }
1608 }
1609 break;
1610
1611 case 2:
1612 case 6:
1613 if (d < s || d >= s + n) {
1614 for (i = 0; i < n; i += 2) {
1615 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1616 }
1617 } else {
1618 for (i = n; i > 0; ) {
1619 i -= 2;
1620 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1621 }
1622 }
1623 break;
1624
1625 default:
1626 if (d < s || d >= s + n) {
1627 for (i = 0; i < n; i++) {
1628 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1629 }
1630 } else {
1631 for (i = n; i > 0; ) {
1632 i -= 1;
1633 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1634 }
1635 }
1636 break;
1637 }
1638 }
1639
1640 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1641 {
1642 intptr_t opr_sz = simd_oprsz(desc);
1643 size_t n_ofs = simd_data(desc);
1644 size_t n_siz = opr_sz - n_ofs;
1645
1646 if (vd != vm) {
1647 swap_memmove(vd, vn + n_ofs, n_siz);
1648 swap_memmove(vd + n_siz, vm, n_ofs);
1649 } else if (vd != vn) {
1650 swap_memmove(vd + n_siz, vd, n_ofs);
1651 swap_memmove(vd, vn + n_ofs, n_siz);
1652 } else {
1653 /* vd == vn == vm. Need temp space. */
1654 ARMVectorReg tmp;
1655 swap_memmove(&tmp, vm, n_ofs);
1656 swap_memmove(vd, vd + n_ofs, n_siz);
1657 memcpy(vd + n_siz, &tmp, n_ofs);
1658 }
1659 }
1660
1661 #define DO_INSR(NAME, TYPE, H) \
1662 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1663 { \
1664 intptr_t opr_sz = simd_oprsz(desc); \
1665 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1666 *(TYPE *)(vd + H(0)) = val; \
1667 }
1668
1669 DO_INSR(sve_insr_b, uint8_t, H1)
1670 DO_INSR(sve_insr_h, uint16_t, H1_2)
1671 DO_INSR(sve_insr_s, uint32_t, H1_4)
1672 DO_INSR(sve_insr_d, uint64_t, )
1673
1674 #undef DO_INSR
1675
1676 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1677 {
1678 intptr_t i, j, opr_sz = simd_oprsz(desc);
1679 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1680 uint64_t f = *(uint64_t *)(vn + i);
1681 uint64_t b = *(uint64_t *)(vn + j);
1682 *(uint64_t *)(vd + i) = bswap64(b);
1683 *(uint64_t *)(vd + j) = bswap64(f);
1684 }
1685 }
1686
1687 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1688 {
1689 intptr_t i, j, opr_sz = simd_oprsz(desc);
1690 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1691 uint64_t f = *(uint64_t *)(vn + i);
1692 uint64_t b = *(uint64_t *)(vn + j);
1693 *(uint64_t *)(vd + i) = hswap64(b);
1694 *(uint64_t *)(vd + j) = hswap64(f);
1695 }
1696 }
1697
1698 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1699 {
1700 intptr_t i, j, opr_sz = simd_oprsz(desc);
1701 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1702 uint64_t f = *(uint64_t *)(vn + i);
1703 uint64_t b = *(uint64_t *)(vn + j);
1704 *(uint64_t *)(vd + i) = rol64(b, 32);
1705 *(uint64_t *)(vd + j) = rol64(f, 32);
1706 }
1707 }
1708
1709 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1710 {
1711 intptr_t i, j, opr_sz = simd_oprsz(desc);
1712 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1713 uint64_t f = *(uint64_t *)(vn + i);
1714 uint64_t b = *(uint64_t *)(vn + j);
1715 *(uint64_t *)(vd + i) = b;
1716 *(uint64_t *)(vd + j) = f;
1717 }
1718 }
1719
1720 #define DO_TBL(NAME, TYPE, H) \
1721 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1722 { \
1723 intptr_t i, opr_sz = simd_oprsz(desc); \
1724 uintptr_t elem = opr_sz / sizeof(TYPE); \
1725 TYPE *d = vd, *n = vn, *m = vm; \
1726 ARMVectorReg tmp; \
1727 if (unlikely(vd == vn)) { \
1728 n = memcpy(&tmp, vn, opr_sz); \
1729 } \
1730 for (i = 0; i < elem; i++) { \
1731 TYPE j = m[H(i)]; \
1732 d[H(i)] = j < elem ? n[H(j)] : 0; \
1733 } \
1734 }
1735
1736 DO_TBL(sve_tbl_b, uint8_t, H1)
1737 DO_TBL(sve_tbl_h, uint16_t, H2)
1738 DO_TBL(sve_tbl_s, uint32_t, H4)
1739 DO_TBL(sve_tbl_d, uint64_t, )
1740
1741 #undef DO_TBL
1742
1743 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1744 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1745 { \
1746 intptr_t i, opr_sz = simd_oprsz(desc); \
1747 TYPED *d = vd; \
1748 TYPES *n = vn; \
1749 ARMVectorReg tmp; \
1750 if (unlikely(vn - vd < opr_sz)) { \
1751 n = memcpy(&tmp, n, opr_sz / 2); \
1752 } \
1753 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1754 d[HD(i)] = n[HS(i)]; \
1755 } \
1756 }
1757
1758 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1759 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1760 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1761
1762 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1763 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1764 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1765
1766 #undef DO_UNPK
1767
1768 /* Mask of bits included in the even numbered predicates of width esz.
1769 * We also use this for expand_bits/compress_bits, and so extend the
1770 * same pattern out to 16-bit units.
1771 */
1772 static const uint64_t even_bit_esz_masks[5] = {
1773 0x5555555555555555ull,
1774 0x3333333333333333ull,
1775 0x0f0f0f0f0f0f0f0full,
1776 0x00ff00ff00ff00ffull,
1777 0x0000ffff0000ffffull,
1778 };
1779
1780 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1781 * For N==0, this corresponds to the operation that in qemu/bitops.h
1782 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1783 * section 7-2 Shuffling Bits.
1784 */
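/* For example, with N == 0 each input bit i moves to output bit 2*i,
 * so expand_bits(0b1011, 0) == 0b01000101 (0x45).
 */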
1785 static uint64_t expand_bits(uint64_t x, int n)
1786 {
1787 int i;
1788
1789 x &= 0xffffffffu;
1790 for (i = 4; i >= n; i--) {
1791 int sh = 1 << i;
1792 x = ((x << sh) | x) & even_bit_esz_masks[i];
1793 }
1794 return x;
1795 }
1796
1797 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1798 * For N==0, this corresponds to the operation that in qemu/bitops.h
1799 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1800 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1801 */
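/* This inverts expand_bits (the in-between bits are discarded):
 * e.g. compress_bits(0b01000101, 0) == 0b1011.
 */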
1802 static uint64_t compress_bits(uint64_t x, int n)
1803 {
1804 int i;
1805
1806 for (i = n; i <= 4; i++) {
1807 int sh = 1 << i;
1808 x &= even_bit_esz_masks[i];
1809 x = (x >> sh) | x;
1810 }
1811 return x & 0xffffffffu;
1812 }
1813
1814 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1815 {
1816 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1817 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1818 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1819 uint64_t *d = vd;
1820 intptr_t i;
1821
1822 if (oprsz <= 8) {
1823 uint64_t nn = *(uint64_t *)vn;
1824 uint64_t mm = *(uint64_t *)vm;
1825 int half = 4 * oprsz;
1826
1827 nn = extract64(nn, high * half, half);
1828 mm = extract64(mm, high * half, half);
1829 nn = expand_bits(nn, esz);
1830 mm = expand_bits(mm, esz);
1831 d[0] = nn + (mm << (1 << esz));
1832 } else {
1833 ARMPredicateReg tmp_n, tmp_m;
1834
1835 /* We produce output faster than we consume input.
1836 Therefore we must be mindful of possible overlap. */
1837 if ((vn - vd) < (uintptr_t)oprsz) {
1838 vn = memcpy(&tmp_n, vn, oprsz);
1839 }
1840 if ((vm - vd) < (uintptr_t)oprsz) {
1841 vm = memcpy(&tmp_m, vm, oprsz);
1842 }
1843 if (high) {
1844 high = oprsz >> 1;
1845 }
1846
1847 if ((high & 3) == 0) {
1848 uint32_t *n = vn, *m = vm;
1849 high >>= 2;
1850
1851 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1852 uint64_t nn = n[H4(high + i)];
1853 uint64_t mm = m[H4(high + i)];
1854
1855 nn = expand_bits(nn, esz);
1856 mm = expand_bits(mm, esz);
1857 d[i] = nn + (mm << (1 << esz));
1858 }
1859 } else {
1860 uint8_t *n = vn, *m = vm;
1861 uint16_t *d16 = vd;
1862
1863 for (i = 0; i < oprsz / 2; i++) {
1864 uint16_t nn = n[H1(high + i)];
1865 uint16_t mm = m[H1(high + i)];
1866
1867 nn = expand_bits(nn, esz);
1868 mm = expand_bits(mm, esz);
1869 d16[H2(i)] = nn + (mm << (1 << esz));
1870 }
1871 }
1872 }
1873 }
1874
1875 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1876 {
1877 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1878 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1879 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1880 uint64_t *d = vd, *n = vn, *m = vm;
1881 uint64_t l, h;
1882 intptr_t i;
1883
1884 if (oprsz <= 8) {
1885 l = compress_bits(n[0] >> odd, esz);
1886 h = compress_bits(m[0] >> odd, esz);
1887 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1888 } else {
1889 ARMPredicateReg tmp_m;
1890 intptr_t oprsz_16 = oprsz / 16;
1891
1892 if ((vm - vd) < (uintptr_t)oprsz) {
1893 m = memcpy(&tmp_m, vm, oprsz);
1894 }
1895
1896 for (i = 0; i < oprsz_16; i++) {
1897 l = n[2 * i + 0];
1898 h = n[2 * i + 1];
1899 l = compress_bits(l >> odd, esz);
1900 h = compress_bits(h >> odd, esz);
1901 d[i] = l + (h << 32);
1902 }
1903
1904         /* When OPRSZ is not a multiple of 16, the results from M do not
1905 align nicely with the uint64_t for D. Put the aligned results
1906 from M into TMP_M and then copy it into place afterward. */
1907 if (oprsz & 15) {
1908 d[i] = compress_bits(n[2 * i] >> odd, esz);
1909
1910 for (i = 0; i < oprsz_16; i++) {
1911 l = m[2 * i + 0];
1912 h = m[2 * i + 1];
1913 l = compress_bits(l >> odd, esz);
1914 h = compress_bits(h >> odd, esz);
1915 tmp_m.p[i] = l + (h << 32);
1916 }
1917 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1918
1919 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1920 } else {
1921 for (i = 0; i < oprsz_16; i++) {
1922 l = m[2 * i + 0];
1923 h = m[2 * i + 1];
1924 l = compress_bits(l >> odd, esz);
1925 h = compress_bits(h >> odd, esz);
1926 d[oprsz_16 + i] = l + (h << 32);
1927 }
1928 }
1929 }
1930 }
1931
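/* Predicate TRN: interleave the even-numbered (ODD == 0) or the
 * odd-numbered (ODD == 1) elements of N and M.  For esz == 0 and
 * ODD == 0, bit 2k of D comes from bit 2k of N and bit 2k+1 of D
 * comes from bit 2k of M.
 */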
1932 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1933 {
1934 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1935 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1936 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1937 uint64_t *d = vd, *n = vn, *m = vm;
1938 uint64_t mask;
1939 int shr, shl;
1940 intptr_t i;
1941
1942 shl = 1 << esz;
1943 shr = 0;
1944 mask = even_bit_esz_masks[esz];
1945 if (odd) {
1946 mask <<= shl;
1947 shr = shl;
1948 shl = 0;
1949 }
1950
1951 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1952 uint64_t nn = (n[i] & mask) >> shr;
1953 uint64_t mm = (m[i] & mask) << shl;
1954 d[i] = nn + mm;
1955 }
1956 }
1957
1958 /* Reverse units of 2**N bits. */
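/* For example, N == 3 reverses the byte order (a plain bswap64),
 * while N == 0 reverses every individual bit.
 */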
1959 static uint64_t reverse_bits_64(uint64_t x, int n)
1960 {
1961 int i, sh;
1962
1963 x = bswap64(x);
1964 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1965 uint64_t mask = even_bit_esz_masks[i];
1966 x = ((x & mask) << sh) | ((x >> sh) & mask);
1967 }
1968 return x;
1969 }
1970
1971 static uint8_t reverse_bits_8(uint8_t x, int n)
1972 {
1973 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
1974 int i, sh;
1975
1976 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1977 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
1978 }
1979 return x;
1980 }
1981
1982 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
1983 {
1984 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1985 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1986 intptr_t i, oprsz_2 = oprsz / 2;
1987
1988 if (oprsz <= 8) {
1989 uint64_t l = *(uint64_t *)vn;
1990 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
1991 *(uint64_t *)vd = l;
1992 } else if ((oprsz & 15) == 0) {
1993 for (i = 0; i < oprsz_2; i += 8) {
1994 intptr_t ih = oprsz - 8 - i;
1995 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
1996 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
1997 *(uint64_t *)(vd + i) = h;
1998 *(uint64_t *)(vd + ih) = l;
1999 }
2000 } else {
2001 for (i = 0; i < oprsz_2; i += 1) {
2002 intptr_t il = H1(i);
2003 intptr_t ih = H1(oprsz - 1 - i);
2004 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2005 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2006 *(uint8_t *)(vd + il) = h;
2007 *(uint8_t *)(vd + ih) = l;
2008 }
2009 }
2010 }
2011
2012 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2013 {
2014 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2015 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2016 uint64_t *d = vd;
2017 intptr_t i;
2018
2019 if (oprsz <= 8) {
2020 uint64_t nn = *(uint64_t *)vn;
2021 int half = 4 * oprsz;
2022
2023 nn = extract64(nn, high * half, half);
2024 nn = expand_bits(nn, 0);
2025 d[0] = nn;
2026 } else {
2027 ARMPredicateReg tmp_n;
2028
2029 /* We produce output faster than we consume input.
2030 Therefore we must be mindful of possible overlap. */
2031 if ((vn - vd) < (uintptr_t)oprsz) {
2032 vn = memcpy(&tmp_n, vn, oprsz);
2033 }
2034 if (high) {
2035 high = oprsz >> 1;
2036 }
2037
2038 if ((high & 3) == 0) {
2039 uint32_t *n = vn;
2040 high >>= 2;
2041
2042 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2043 uint64_t nn = n[H4(high + i)];
2044 d[i] = expand_bits(nn, 0);
2045 }
2046 } else {
2047 uint16_t *d16 = vd;
2048 uint8_t *n = vn;
2049
2050 for (i = 0; i < oprsz / 2; i++) {
2051 uint16_t nn = n[H1(high + i)];
2052 d16[H2(i)] = expand_bits(nn, 0);
2053 }
2054 }
2055 }
2056 }
2057
2058 #define DO_ZIP(NAME, TYPE, H) \
2059 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2060 { \
2061 intptr_t oprsz = simd_oprsz(desc); \
2062 intptr_t i, oprsz_2 = oprsz / 2; \
2063 ARMVectorReg tmp_n, tmp_m; \
2064 /* We produce output faster than we consume input. \
2065 Therefore we must be mindful of possible overlap. */ \
2066 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2067 vn = memcpy(&tmp_n, vn, oprsz_2); \
2068 } \
2069 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2070 vm = memcpy(&tmp_m, vm, oprsz_2); \
2071 } \
2072 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2073 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2074 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2075 } \
2076 }
2077
2078 DO_ZIP(sve_zip_b, uint8_t, H1)
2079 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2080 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2081 DO_ZIP(sve_zip_d, uint64_t, )
2082
2083 #define DO_UZP(NAME, TYPE, H) \
2084 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2085 { \
2086 intptr_t oprsz = simd_oprsz(desc); \
2087 intptr_t oprsz_2 = oprsz / 2; \
2088 intptr_t odd_ofs = simd_data(desc); \
2089 intptr_t i; \
2090 ARMVectorReg tmp_m; \
2091 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2092 vm = memcpy(&tmp_m, vm, oprsz); \
2093 } \
2094 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2095 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2096 } \
2097 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2098 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2099 } \
2100 }
2101
2102 DO_UZP(sve_uzp_b, uint8_t, H1)
2103 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2104 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2105 DO_UZP(sve_uzp_d, uint64_t, )
2106
2107 #define DO_TRN(NAME, TYPE, H) \
2108 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2109 { \
2110 intptr_t oprsz = simd_oprsz(desc); \
2111 intptr_t odd_ofs = simd_data(desc); \
2112 intptr_t i; \
2113 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2114 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2115 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2116 *(TYPE *)(vd + H(i + 0)) = ae; \
2117 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2118 } \
2119 }
2120
2121 DO_TRN(sve_trn_b, uint8_t, H1)
2122 DO_TRN(sve_trn_h, uint16_t, H1_2)
2123 DO_TRN(sve_trn_s, uint32_t, H1_4)
2124 DO_TRN(sve_trn_d, uint64_t, )
2125
2126 #undef DO_ZIP
2127 #undef DO_UZP
2128 #undef DO_TRN
2129
2130 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2131 {
2132 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2133 uint32_t *d = vd, *n = vn;
2134 uint8_t *pg = vg;
2135
2136 for (i = j = 0; i < opr_sz; i++) {
2137 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2138 d[H4(j)] = n[H4(i)];
2139 j++;
2140 }
2141 }
2142 for (; j < opr_sz; j++) {
2143 d[H4(j)] = 0;
2144 }
2145 }
2146
2147 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2148 {
2149 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2150 uint64_t *d = vd, *n = vn;
2151 uint8_t *pg = vg;
2152
2153 for (i = j = 0; i < opr_sz; i++) {
2154 if (pg[H1(i)] & 1) {
2155 d[j] = n[i];
2156 j++;
2157 }
2158 }
2159 for (; j < opr_sz; j++) {
2160 d[j] = 0;
2161 }
2162 }
2163
2164 /* Similar to the ARM LastActiveElement pseudocode function, except the
2165 * result is multiplied by the element size. This includes the not found
2166 * indication; e.g. not found for esz=3 is -8.
2167 */
2168 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2169 {
2170 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2171 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2172
2173 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2174 }
2175
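/* Copy the elements of N from the first through the last active element
 * of G (inclusive) to the low end of D, then fill the remainder of D
 * with the low elements of M.
 */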
2176 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2177 {
2178 intptr_t opr_sz = simd_oprsz(desc) / 8;
2179 int esz = simd_data(desc);
2180 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2181 intptr_t i, first_i, last_i;
2182 ARMVectorReg tmp;
2183
2184 first_i = last_i = 0;
2185 first_g = last_g = 0;
2186
2187 /* Find the extent of the active elements within VG. */
2188 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2189 pg = *(uint64_t *)(vg + i) & mask;
2190 if (pg) {
2191 if (last_g == 0) {
2192 last_g = pg;
2193 last_i = i;
2194 }
2195 first_g = pg;
2196 first_i = i;
2197 }
2198 }
2199
2200 len = 0;
2201 if (first_g != 0) {
2202 first_i = first_i * 8 + ctz64(first_g);
2203 last_i = last_i * 8 + 63 - clz64(last_g);
2204 len = last_i - first_i + (1 << esz);
2205 if (vd == vm) {
2206 vm = memcpy(&tmp, vm, opr_sz * 8);
2207 }
2208 swap_memmove(vd, vn + first_i, len);
2209 }
2210 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2211 }
2212
2213 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2214 void *vg, uint32_t desc)
2215 {
2216 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2217 uint64_t *d = vd, *n = vn, *m = vm;
2218 uint8_t *pg = vg;
2219
2220 for (i = 0; i < opr_sz; i += 1) {
2221 uint64_t nn = n[i], mm = m[i];
2222 uint64_t pp = expand_pred_b(pg[H1(i)]);
2223 d[i] = (nn & pp) | (mm & ~pp);
2224 }
2225 }
2226
2227 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2228 void *vg, uint32_t desc)
2229 {
2230 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2231 uint64_t *d = vd, *n = vn, *m = vm;
2232 uint8_t *pg = vg;
2233
2234 for (i = 0; i < opr_sz; i += 1) {
2235 uint64_t nn = n[i], mm = m[i];
2236 uint64_t pp = expand_pred_h(pg[H1(i)]);
2237 d[i] = (nn & pp) | (mm & ~pp);
2238 }
2239 }
2240
2241 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2242 void *vg, uint32_t desc)
2243 {
2244 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2245 uint64_t *d = vd, *n = vn, *m = vm;
2246 uint8_t *pg = vg;
2247
2248 for (i = 0; i < opr_sz; i += 1) {
2249 uint64_t nn = n[i], mm = m[i];
2250 uint64_t pp = expand_pred_s(pg[H1(i)]);
2251 d[i] = (nn & pp) | (mm & ~pp);
2252 }
2253 }
2254
2255 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2256 void *vg, uint32_t desc)
2257 {
2258 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2259 uint64_t *d = vd, *n = vn, *m = vm;
2260 uint8_t *pg = vg;
2261
2262 for (i = 0; i < opr_sz; i += 1) {
2263 uint64_t nn = n[i], mm = m[i];
2264 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2265 }
2266 }
2267
2268 /* Two operand comparison controlled by a predicate.
2269 * ??? It is very tempting to want to be able to expand this inline
2270 * with x86 instructions, e.g.
2271 *
2272 * vcmpeqw zm, zn, %ymm0
2273 * vpmovmskb %ymm0, %eax
2274 * and $0x5555, %eax
2275 * and pg, %eax
2276 *
2277 * or even aarch64, e.g.
2278 *
2279 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2280 * cmeq v0.8h, zn, zm
2281 * and v0.8h, v0.8h, mask
2282 * addv h0, v0.8h
2283 * and v0.8b, pg
2284 *
2285 * However, coming up with an abstraction that allows vector inputs and
2286 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2287 * scalar outputs, is tricky.
2288 */
2289 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2290 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2291 { \
2292 intptr_t opr_sz = simd_oprsz(desc); \
2293 uint32_t flags = PREDTEST_INIT; \
2294 intptr_t i = opr_sz; \
2295 do { \
2296 uint64_t out = 0, pg; \
2297 do { \
2298 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2299 TYPE nn = *(TYPE *)(vn + H(i)); \
2300 TYPE mm = *(TYPE *)(vm + H(i)); \
2301 out |= nn OP mm; \
2302 } while (i & 63); \
2303 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2304 out &= pg; \
2305 *(uint64_t *)(vd + (i >> 3)) = out; \
2306 flags = iter_predtest_bwd(out, pg, flags); \
2307 } while (i > 0); \
2308 return flags; \
2309 }
2310
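/* The MASK given to each size variant keeps only the predicate bit that
 * governs an element: every bit for B, every second bit for H, every
 * fourth bit for S, and every eighth bit for D.
 */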
2311 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2312 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2313 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2314 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2315 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2316 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2317 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2318 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2319
2320 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2321 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2322 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2323 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2324
2325 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2326 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2327 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2328 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2329
2330 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2331 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2332 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2333 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2334
2335 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2336 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2337 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2338 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2339
2340 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2341 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2342 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2343 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2344
2345 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2346 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2347 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2348 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2349
2350 #undef DO_CMP_PPZZ_B
2351 #undef DO_CMP_PPZZ_H
2352 #undef DO_CMP_PPZZ_S
2353 #undef DO_CMP_PPZZ_D
2354 #undef DO_CMP_PPZZ
2355
2356 /* Similar, but the second source is "wide". */
2357 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2358 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2359 { \
2360 intptr_t opr_sz = simd_oprsz(desc); \
2361 uint32_t flags = PREDTEST_INIT; \
2362 intptr_t i = opr_sz; \
2363 do { \
2364 uint64_t out = 0, pg; \
2365 do { \
2366 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2367 do { \
2368 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2369 TYPE nn = *(TYPE *)(vn + H(i)); \
2370 out |= nn OP mm; \
2371 } while (i & 7); \
2372 } while (i & 63); \
2373 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2374 out &= pg; \
2375 *(uint64_t *)(vd + (i >> 3)) = out; \
2376 flags = iter_predtest_bwd(out, pg, flags); \
2377 } while (i > 0); \
2378 return flags; \
2379 }
2380
2381 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2382 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2383 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2384 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2385 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2386 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2387
2388 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2389 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2390 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2391
2392 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2393 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2394 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2395
2396 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2397 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2398 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2399
2400 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2401 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2402 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2403
2404 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2405 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2406 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2407
2408 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2409 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2410 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2411
2412 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2413 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2414 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2415
2416 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2417 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2418 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2419
2420 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2421 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2422 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2423
2424 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2425 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2426 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2427
2428 #undef DO_CMP_PPZW_B
2429 #undef DO_CMP_PPZW_H
2430 #undef DO_CMP_PPZW_S
2431 #undef DO_CMP_PPZW
2432
2433 /* Similar, but the second source is immediate. */
2434 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2435 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2436 { \
2437 intptr_t opr_sz = simd_oprsz(desc); \
2438 uint32_t flags = PREDTEST_INIT; \
2439 TYPE mm = simd_data(desc); \
2440 intptr_t i = opr_sz; \
2441 do { \
2442 uint64_t out = 0, pg; \
2443 do { \
2444 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2445 TYPE nn = *(TYPE *)(vn + H(i)); \
2446 out |= nn OP mm; \
2447 } while (i & 63); \
2448 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2449 out &= pg; \
2450 *(uint64_t *)(vd + (i >> 3)) = out; \
2451 flags = iter_predtest_bwd(out, pg, flags); \
2452 } while (i > 0); \
2453 return flags; \
2454 }
2455
2456 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2457 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2458 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2459 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2460 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2461 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2462 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2463 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2464
2465 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2466 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2467 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2468 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2469
2470 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2471 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2472 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2473 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2474
2475 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2476 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2477 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2478 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2479
2480 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2481 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2482 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2483 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2484
2485 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2486 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2487 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2488 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2489
2490 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2491 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2492 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2493 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2494
2495 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2496 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2497 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2498 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2499
2500 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2501 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2502 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2503 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2504
2505 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2506 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2507 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2508 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2509
2510 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2511 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2512 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2513 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2514
2515 #undef DO_CMP_PPZI_B
2516 #undef DO_CMP_PPZI_H
2517 #undef DO_CMP_PPZI_S
2518 #undef DO_CMP_PPZI_D
2519 #undef DO_CMP_PPZI
2520
2521 /* Similar to the ARM LastActive pseudocode function. */
2522 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2523 {
2524 intptr_t i;
2525
2526 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2527 uint64_t pg = *(uint64_t *)(vg + i);
2528 if (pg) {
2529 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2530 }
2531 }
2532 return 0;
2533 }
2534
2535 /* Compute a mask into RETB that is true for all G, up to and including
2536 * (if after) or excluding (if !after) the first G & N.
2537 * Return true if BRK found.
2538 */
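/* For example, with G == 0xff and N == 0x08 the first true N bit under G
 * is bit 3, so AFTER yields B == 0x0f and !AFTER yields B == 0x07; once
 * BRK is set, each subsequent word produces B == 0.
 */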
2539 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2540 bool brk, bool after)
2541 {
2542 uint64_t b;
2543
2544 if (brk) {
2545 b = 0;
2546 } else if ((g & n) == 0) {
2547 /* For all G, no N are set; break not found. */
2548 b = g;
2549 } else {
2550 /* Break somewhere in N. Locate it. */
2551 b = g & n; /* guard true, pred true */
2552 b = b & -b; /* first such */
2553 if (after) {
2554 b = b | (b - 1); /* break after same */
2555 } else {
2556 b = b - 1; /* break before same */
2557 }
2558 brk = true;
2559 }
2560
2561 *retb = b;
2562 return brk;
2563 }
2564
2565 /* Compute a zeroing BRK. */
2566 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2567 intptr_t oprsz, bool after)
2568 {
2569 bool brk = false;
2570 intptr_t i;
2571
2572 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2573 uint64_t this_b, this_g = g[i];
2574
2575 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2576 d[i] = this_b & this_g;
2577 }
2578 }
2579
2580 /* Likewise, but also compute flags. */
2581 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2582 intptr_t oprsz, bool after)
2583 {
2584 uint32_t flags = PREDTEST_INIT;
2585 bool brk = false;
2586 intptr_t i;
2587
2588 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2589 uint64_t this_b, this_d, this_g = g[i];
2590
2591 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2592 d[i] = this_d = this_b & this_g;
2593 flags = iter_predtest_fwd(this_d, this_g, flags);
2594 }
2595 return flags;
2596 }
2597
2598 /* Compute a merging BRK. */
2599 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2600 intptr_t oprsz, bool after)
2601 {
2602 bool brk = false;
2603 intptr_t i;
2604
2605 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2606 uint64_t this_b, this_g = g[i];
2607
2608 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2609 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2610 }
2611 }
2612
2613 /* Likewise, but also compute flags. */
2614 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2615 intptr_t oprsz, bool after)
2616 {
2617 uint32_t flags = PREDTEST_INIT;
2618 bool brk = false;
2619 intptr_t i;
2620
2621     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2622 uint64_t this_b, this_d = d[i], this_g = g[i];
2623
2624 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2625 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2626 flags = iter_predtest_fwd(this_d, this_g, flags);
2627 }
2628 return flags;
2629 }
2630
2631 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2632 {
2633 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2634 * The compiler should turn this into 4 64-bit integer stores.
2635 */
2636 memset(d, 0, sizeof(ARMPredicateReg));
2637 return PREDTEST_INIT;
2638 }
2639
2640 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2641 uint32_t pred_desc)
2642 {
2643 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2644 if (last_active_pred(vn, vg, oprsz)) {
2645 compute_brk_z(vd, vm, vg, oprsz, true);
2646 } else {
2647 do_zero(vd, oprsz);
2648 }
2649 }
2650
2651 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2652 uint32_t pred_desc)
2653 {
2654 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2655 if (last_active_pred(vn, vg, oprsz)) {
2656 return compute_brks_z(vd, vm, vg, oprsz, true);
2657 } else {
2658 return do_zero(vd, oprsz);
2659 }
2660 }
2661
2662 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2663 uint32_t pred_desc)
2664 {
2665 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2666 if (last_active_pred(vn, vg, oprsz)) {
2667 compute_brk_z(vd, vm, vg, oprsz, false);
2668 } else {
2669 do_zero(vd, oprsz);
2670 }
2671 }
2672
2673 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2674 uint32_t pred_desc)
2675 {
2676 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2677 if (last_active_pred(vn, vg, oprsz)) {
2678 return compute_brks_z(vd, vm, vg, oprsz, false);
2679 } else {
2680 return do_zero(vd, oprsz);
2681 }
2682 }
2683
2684 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2685 {
2686 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2687 compute_brk_z(vd, vn, vg, oprsz, true);
2688 }
2689
2690 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2691 {
2692 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2693 return compute_brks_z(vd, vn, vg, oprsz, true);
2694 }
2695
2696 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2697 {
2698 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2699 compute_brk_z(vd, vn, vg, oprsz, false);
2700 }
2701
2702 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2703 {
2704 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2705 return compute_brks_z(vd, vn, vg, oprsz, false);
2706 }
2707
2708 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2709 {
2710 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2711 compute_brk_m(vd, vn, vg, oprsz, true);
2712 }
2713
2714 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2715 {
2716 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717 return compute_brks_m(vd, vn, vg, oprsz, true);
2718 }
2719
2720 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2721 {
2722 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2723 compute_brk_m(vd, vn, vg, oprsz, false);
2724 }
2725
2726 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2727 {
2728 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2729 return compute_brks_m(vd, vn, vg, oprsz, false);
2730 }
2731
2732 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2733 {
2734 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2735
2736 if (!last_active_pred(vn, vg, oprsz)) {
2737 do_zero(vd, oprsz);
2738 }
2739 }
2740
2741 /* As if PredTest(Ones(PL), D, esz). */
2742 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2743 uint64_t esz_mask)
2744 {
2745 uint32_t flags = PREDTEST_INIT;
2746 intptr_t i;
2747
2748 for (i = 0; i < oprsz / 8; i++) {
2749 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2750 }
2751 if (oprsz & 7) {
2752 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2753 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2754 }
2755 return flags;
2756 }
2757
2758 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2759 {
2760 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2761
2762 if (last_active_pred(vn, vg, oprsz)) {
2763 return predtest_ones(vd, oprsz, -1);
2764 } else {
2765 return do_zero(vd, oprsz);
2766 }
2767 }
2768
2769 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2770 {
2771 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2772 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2773 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2774 intptr_t i;
2775
2776 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2777 uint64_t t = n[i] & g[i] & mask;
2778 sum += ctpop64(t);
2779 }
2780 return sum;
2781 }
2782
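/* Set the first COUNT elements (each occupying 2**ESZ predicate bits)
 * to true, as required by the WHILE* instructions, and return the
 * PredTest flags for the result.  For example, count == 3 with esz == 1
 * sets bits 0, 2 and 4 of the first predicate word.
 */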
2783 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2784 {
2785 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2787 uint64_t esz_mask = pred_esz_masks[esz];
2788 ARMPredicateReg *d = vd;
2789 uint32_t flags;
2790 intptr_t i;
2791
2792 /* Begin with a zero predicate register. */
2793 flags = do_zero(d, oprsz);
2794 if (count == 0) {
2795 return flags;
2796 }
2797
2798 /* Scale from predicate element count to bits. */
2799 count <<= esz;
2800 /* Bound to the bits in the predicate. */
2801 count = MIN(count, oprsz * 8);
2802
2803 /* Set all of the requested bits. */
2804 for (i = 0; i < count / 64; ++i) {
2805 d->p[i] = esz_mask;
2806 }
2807 if (count & 63) {
2808 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2809 }
2810
2811 return predtest_ones(d, oprsz, esz_mask);
2812 }
2813
2814 /*
2815 * Load contiguous data, protected by a governing predicate.
2816 */
2817 #define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
2818 static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
2819 target_ulong addr, intptr_t oprsz, \
2820 uintptr_t ra) \
2821 { \
2822 intptr_t i = 0; \
2823 do { \
2824 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2825 do { \
2826 TYPEM m = 0; \
2827 if (pg & 1) { \
2828 m = FN(env, addr, ra); \
2829 } \
2830 *(TYPEE *)(vd + H(i)) = m; \
2831 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
2832 addr += sizeof(TYPEM); \
2833 } while (i & 15); \
2834 } while (i < oprsz); \
2835 } \
2836 void HELPER(NAME)(CPUARMState *env, void *vg, \
2837 target_ulong addr, uint32_t desc) \
2838 { \
2839 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
2840 addr, simd_oprsz(desc), GETPC()); \
2841 }
2842
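/* The LD2/LD3/LD4 variants below load 2, 3 or 4 consecutive TYPEM values
 * per active element and de-interleave them into consecutive vector
 * registers; the destination register number wraps modulo 32.  Inactive
 * elements are zeroed in every destination register.
 */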
2843 #define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
2844 void HELPER(NAME)(CPUARMState *env, void *vg, \
2845 target_ulong addr, uint32_t desc) \
2846 { \
2847 intptr_t i, oprsz = simd_oprsz(desc); \
2848 intptr_t ra = GETPC(); \
2849 unsigned rd = simd_data(desc); \
2850 void *d1 = &env->vfp.zregs[rd]; \
2851 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
2852 for (i = 0; i < oprsz; ) { \
2853 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2854 do { \
2855 TYPEM m1 = 0, m2 = 0; \
2856 if (pg & 1) { \
2857 m1 = FN(env, addr, ra); \
2858 m2 = FN(env, addr + sizeof(TYPEM), ra); \
2859 } \
2860 *(TYPEE *)(d1 + H(i)) = m1; \
2861 *(TYPEE *)(d2 + H(i)) = m2; \
2862 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
2863 addr += 2 * sizeof(TYPEM); \
2864 } while (i & 15); \
2865 } \
2866 }
2867
2868 #define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
2869 void HELPER(NAME)(CPUARMState *env, void *vg, \
2870 target_ulong addr, uint32_t desc) \
2871 { \
2872 intptr_t i, oprsz = simd_oprsz(desc); \
2873 intptr_t ra = GETPC(); \
2874 unsigned rd = simd_data(desc); \
2875 void *d1 = &env->vfp.zregs[rd]; \
2876 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
2877 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
2878 for (i = 0; i < oprsz; ) { \
2879 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2880 do { \
2881 TYPEM m1 = 0, m2 = 0, m3 = 0; \
2882 if (pg & 1) { \
2883 m1 = FN(env, addr, ra); \
2884 m2 = FN(env, addr + sizeof(TYPEM), ra); \
2885 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
2886 } \
2887 *(TYPEE *)(d1 + H(i)) = m1; \
2888 *(TYPEE *)(d2 + H(i)) = m2; \
2889 *(TYPEE *)(d3 + H(i)) = m3; \
2890 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
2891 addr += 3 * sizeof(TYPEM); \
2892 } while (i & 15); \
2893 } \
2894 }
2895
2896 #define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
2897 void HELPER(NAME)(CPUARMState *env, void *vg, \
2898 target_ulong addr, uint32_t desc) \
2899 { \
2900 intptr_t i, oprsz = simd_oprsz(desc); \
2901 intptr_t ra = GETPC(); \
2902 unsigned rd = simd_data(desc); \
2903 void *d1 = &env->vfp.zregs[rd]; \
2904 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
2905 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
2906 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
2907 for (i = 0; i < oprsz; ) { \
2908 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2909 do { \
2910 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
2911 if (pg & 1) { \
2912 m1 = FN(env, addr, ra); \
2913 m2 = FN(env, addr + sizeof(TYPEM), ra); \
2914 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
2915 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
2916 } \
2917 *(TYPEE *)(d1 + H(i)) = m1; \
2918 *(TYPEE *)(d2 + H(i)) = m2; \
2919 *(TYPEE *)(d3 + H(i)) = m3; \
2920 *(TYPEE *)(d4 + H(i)) = m4; \
2921 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
2922 addr += 4 * sizeof(TYPEM); \
2923 } while (i & 15); \
2924 } \
2925 }
2926
2927 DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
2928 DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
2929 DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
2930 DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
2931 DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
2932 DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
2933
2934 DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
2935 DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
2936 DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
2937 DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
2938
2939 DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
2940 DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
2941
2942 DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
2943 DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
2944 DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
2945 DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
2946
2947 DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
2948 DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
2949 DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
2950 DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
2951
2952 DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
2953 DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
2954 DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
2955 DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
2956
2957 DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
2958 DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
2959 DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
2960 DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
2961
2962 #undef DO_LD1
2963 #undef DO_LD2
2964 #undef DO_LD3
2965 #undef DO_LD4
2966
2967 /*
2968 * Load contiguous data, first-fault and no-fault.
2969 */
2970
2971 #ifdef CONFIG_USER_ONLY
2972
2973 /* Fault on byte I. All bits in FFR from I are cleared. The vector
2974 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
2975 * option, which leaves subsequent data unchanged.
2976 */
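/* For example, a fault at vector byte 5 preserves FFR bits 0-4 and
 * clears bit 5 and all higher bits.
 */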
2977 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
2978 {
2979 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
2980
2981 if (i & 63) {
2982 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
2983 i = ROUND_UP(i, 64);
2984 }
2985 for (; i < oprsz; i += 64) {
2986 ffr[i / 64] = 0;
2987 }
2988 }
2989
2990 /* Hold the mmap lock during the operation so that there is no race
2991 * between page_check_range and the load operation. We expect the
2992 * usual case to have no faults at all, so we check the whole range
2993 * first and if successful defer to the normal load operation.
2994 *
2995 * TODO: Change mmap_lock to a rwlock so that multiple readers
2996 * can run simultaneously. This will probably help other uses
2997 * within QEMU as well.
2998 */
2999 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
3000 static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
3001 target_ulong addr, intptr_t oprsz, \
3002 bool first, uintptr_t ra) \
3003 { \
3004 intptr_t i = 0; \
3005 do { \
3006 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3007 do { \
3008 TYPEM m = 0; \
3009 if (pg & 1) { \
3010 if (!first && \
3011 unlikely(page_check_range(addr, sizeof(TYPEM), \
3012 PAGE_READ))) { \
3013 record_fault(env, i, oprsz); \
3014 return; \
3015 } \
3016 m = FN(env, addr, ra); \
3017 first = false; \
3018 } \
3019 *(TYPEE *)(vd + H(i)) = m; \
3020 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3021 addr += sizeof(TYPEM); \
3022 } while (i & 15); \
3023 } while (i < oprsz); \
3024 } \
3025 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
3026 target_ulong addr, uint32_t desc) \