s390x/pci: use a PCI Group structure
[qemu.git] / tcg / tcg-op-gvec.c
1 /*
2 * Generic vector operation expansion
3 *
4 * Copyright (c) 2018 Linaro
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "qemu/main-loop.h"
25 #include "tcg/tcg-gvec-desc.h"
26
/*
 * Largest number of per-lane operations that check_size_impl() accepts
 * when deciding whether an operation may be expanded inline.
 */
#define MAX_UNROLL 4
28
/*
 * Fallback opcode list for expansions that supply no optional vector
 * opcodes: a one-element zero array under CONFIG_DEBUG_TCG, plain NULL
 * otherwise.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
34
35
36 /* Verify vector size and alignment rules. OFS should be the OR of all
37 of the operand offsets so that we can check them all at once. */
38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
39 {
40 uint32_t max_align;
41
42 switch (oprsz) {
43 case 8:
44 case 16:
45 case 32:
46 tcg_debug_assert(oprsz <= maxsz);
47 break;
48 default:
49 tcg_debug_assert(oprsz == maxsz);
50 break;
51 }
52 tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
53
54 max_align = maxsz >= 16 ? 15 : 7;
55 tcg_debug_assert((maxsz & max_align) == 0);
56 tcg_debug_assert((ofs & max_align) == 0);
57 }
58
/*
 * Debug-check that two operands of S bytes at offsets D and A either
 * coincide exactly or do not overlap at all.
 */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    bool same = (d == a);
    bool disjoint = (d + s <= a) || (a + s <= d);

    tcg_debug_assert(same || disjoint);
}
64
/*
 * Debug-check the overlap rules pairwise for three operands of S bytes
 * located at offsets D, A and B.
 */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    const uint32_t ofs[3] = { d, a, b };
    int i, j;

    /* Visit every unordered pair: (d,a), (d,b), (a,b). */
    for (i = 0; i < 3; i++) {
        for (j = i + 1; j < 3; j++) {
            check_overlap_2(ofs[i], ofs[j], s);
        }
    }
}
72
/*
 * Debug-check the overlap rules pairwise for four operands of S bytes
 * located at offsets D, A, B and C.
 */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    const uint32_t ofs[4] = { d, a, b, c };
    int i, j;

    /* Visit every unordered pair, in the order (d,a) ... (b,c). */
    for (i = 0; i < 4; i++) {
        for (j = i + 1; j < 4; j++) {
            check_overlap_2(ofs[i], ofs[j], s);
        }
    }
}
84
85 /* Create a descriptor from components. */
86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
87 {
88 uint32_t desc = 0;
89
90 check_size_align(oprsz, maxsz, 0);
91 tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
92
93 oprsz = (oprsz / 8) - 1;
94 maxsz = (maxsz / 8) - 1;
95
96 /*
97 * We have just asserted in check_size_align that either
98 * oprsz is {8,16,32} or matches maxsz. Encode the final
99 * case with '2', as that would otherwise map to 24.
100 */
101 if (oprsz == maxsz) {
102 oprsz = 2;
103 }
104
105 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
106 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
107 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
108
109 return desc;
110 }
111
112 /* Generate a call to a gvec-style helper with two vector operands. */
113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
114 uint32_t oprsz, uint32_t maxsz, int32_t data,
115 gen_helper_gvec_2 *fn)
116 {
117 TCGv_ptr a0, a1;
118 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
119
120 a0 = tcg_temp_new_ptr();
121 a1 = tcg_temp_new_ptr();
122
123 tcg_gen_addi_ptr(a0, cpu_env, dofs);
124 tcg_gen_addi_ptr(a1, cpu_env, aofs);
125
126 fn(a0, a1, desc);
127
128 tcg_temp_free_ptr(a0);
129 tcg_temp_free_ptr(a1);
130 tcg_temp_free_i32(desc);
131 }
132
133 /* Generate a call to a gvec-style helper with two vector operands
134 and one scalar operand. */
135 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
136 uint32_t oprsz, uint32_t maxsz, int32_t data,
137 gen_helper_gvec_2i *fn)
138 {
139 TCGv_ptr a0, a1;
140 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
141
142 a0 = tcg_temp_new_ptr();
143 a1 = tcg_temp_new_ptr();
144
145 tcg_gen_addi_ptr(a0, cpu_env, dofs);
146 tcg_gen_addi_ptr(a1, cpu_env, aofs);
147
148 fn(a0, a1, c, desc);
149
150 tcg_temp_free_ptr(a0);
151 tcg_temp_free_ptr(a1);
152 tcg_temp_free_i32(desc);
153 }
154
155 /* Generate a call to a gvec-style helper with three vector operands. */
156 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
157 uint32_t oprsz, uint32_t maxsz, int32_t data,
158 gen_helper_gvec_3 *fn)
159 {
160 TCGv_ptr a0, a1, a2;
161 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
162
163 a0 = tcg_temp_new_ptr();
164 a1 = tcg_temp_new_ptr();
165 a2 = tcg_temp_new_ptr();
166
167 tcg_gen_addi_ptr(a0, cpu_env, dofs);
168 tcg_gen_addi_ptr(a1, cpu_env, aofs);
169 tcg_gen_addi_ptr(a2, cpu_env, bofs);
170
171 fn(a0, a1, a2, desc);
172
173 tcg_temp_free_ptr(a0);
174 tcg_temp_free_ptr(a1);
175 tcg_temp_free_ptr(a2);
176 tcg_temp_free_i32(desc);
177 }
178
179 /* Generate a call to a gvec-style helper with four vector operands. */
180 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
181 uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
182 int32_t data, gen_helper_gvec_4 *fn)
183 {
184 TCGv_ptr a0, a1, a2, a3;
185 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
186
187 a0 = tcg_temp_new_ptr();
188 a1 = tcg_temp_new_ptr();
189 a2 = tcg_temp_new_ptr();
190 a3 = tcg_temp_new_ptr();
191
192 tcg_gen_addi_ptr(a0, cpu_env, dofs);
193 tcg_gen_addi_ptr(a1, cpu_env, aofs);
194 tcg_gen_addi_ptr(a2, cpu_env, bofs);
195 tcg_gen_addi_ptr(a3, cpu_env, cofs);
196
197 fn(a0, a1, a2, a3, desc);
198
199 tcg_temp_free_ptr(a0);
200 tcg_temp_free_ptr(a1);
201 tcg_temp_free_ptr(a2);
202 tcg_temp_free_ptr(a3);
203 tcg_temp_free_i32(desc);
204 }
205
206 /* Generate a call to a gvec-style helper with five vector operands. */
207 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
208 uint32_t cofs, uint32_t xofs, uint32_t oprsz,
209 uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
210 {
211 TCGv_ptr a0, a1, a2, a3, a4;
212 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
213
214 a0 = tcg_temp_new_ptr();
215 a1 = tcg_temp_new_ptr();
216 a2 = tcg_temp_new_ptr();
217 a3 = tcg_temp_new_ptr();
218 a4 = tcg_temp_new_ptr();
219
220 tcg_gen_addi_ptr(a0, cpu_env, dofs);
221 tcg_gen_addi_ptr(a1, cpu_env, aofs);
222 tcg_gen_addi_ptr(a2, cpu_env, bofs);
223 tcg_gen_addi_ptr(a3, cpu_env, cofs);
224 tcg_gen_addi_ptr(a4, cpu_env, xofs);
225
226 fn(a0, a1, a2, a3, a4, desc);
227
228 tcg_temp_free_ptr(a0);
229 tcg_temp_free_ptr(a1);
230 tcg_temp_free_ptr(a2);
231 tcg_temp_free_ptr(a3);
232 tcg_temp_free_ptr(a4);
233 tcg_temp_free_i32(desc);
234 }
235
236 /* Generate a call to a gvec-style helper with three vector operands
237 and an extra pointer operand. */
238 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
239 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
240 int32_t data, gen_helper_gvec_2_ptr *fn)
241 {
242 TCGv_ptr a0, a1;
243 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
244
245 a0 = tcg_temp_new_ptr();
246 a1 = tcg_temp_new_ptr();
247
248 tcg_gen_addi_ptr(a0, cpu_env, dofs);
249 tcg_gen_addi_ptr(a1, cpu_env, aofs);
250
251 fn(a0, a1, ptr, desc);
252
253 tcg_temp_free_ptr(a0);
254 tcg_temp_free_ptr(a1);
255 tcg_temp_free_i32(desc);
256 }
257
258 /* Generate a call to a gvec-style helper with three vector operands
259 and an extra pointer operand. */
260 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
261 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
262 int32_t data, gen_helper_gvec_3_ptr *fn)
263 {
264 TCGv_ptr a0, a1, a2;
265 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
266
267 a0 = tcg_temp_new_ptr();
268 a1 = tcg_temp_new_ptr();
269 a2 = tcg_temp_new_ptr();
270
271 tcg_gen_addi_ptr(a0, cpu_env, dofs);
272 tcg_gen_addi_ptr(a1, cpu_env, aofs);
273 tcg_gen_addi_ptr(a2, cpu_env, bofs);
274
275 fn(a0, a1, a2, ptr, desc);
276
277 tcg_temp_free_ptr(a0);
278 tcg_temp_free_ptr(a1);
279 tcg_temp_free_ptr(a2);
280 tcg_temp_free_i32(desc);
281 }
282
283 /* Generate a call to a gvec-style helper with four vector operands
284 and an extra pointer operand. */
285 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
286 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
287 uint32_t maxsz, int32_t data,
288 gen_helper_gvec_4_ptr *fn)
289 {
290 TCGv_ptr a0, a1, a2, a3;
291 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
292
293 a0 = tcg_temp_new_ptr();
294 a1 = tcg_temp_new_ptr();
295 a2 = tcg_temp_new_ptr();
296 a3 = tcg_temp_new_ptr();
297
298 tcg_gen_addi_ptr(a0, cpu_env, dofs);
299 tcg_gen_addi_ptr(a1, cpu_env, aofs);
300 tcg_gen_addi_ptr(a2, cpu_env, bofs);
301 tcg_gen_addi_ptr(a3, cpu_env, cofs);
302
303 fn(a0, a1, a2, a3, ptr, desc);
304
305 tcg_temp_free_ptr(a0);
306 tcg_temp_free_ptr(a1);
307 tcg_temp_free_ptr(a2);
308 tcg_temp_free_ptr(a3);
309 tcg_temp_free_i32(desc);
310 }
311
312 /* Generate a call to a gvec-style helper with five vector operands
313 and an extra pointer operand. */
314 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
315 uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
316 uint32_t oprsz, uint32_t maxsz, int32_t data,
317 gen_helper_gvec_5_ptr *fn)
318 {
319 TCGv_ptr a0, a1, a2, a3, a4;
320 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
321
322 a0 = tcg_temp_new_ptr();
323 a1 = tcg_temp_new_ptr();
324 a2 = tcg_temp_new_ptr();
325 a3 = tcg_temp_new_ptr();
326 a4 = tcg_temp_new_ptr();
327
328 tcg_gen_addi_ptr(a0, cpu_env, dofs);
329 tcg_gen_addi_ptr(a1, cpu_env, aofs);
330 tcg_gen_addi_ptr(a2, cpu_env, bofs);
331 tcg_gen_addi_ptr(a3, cpu_env, cofs);
332 tcg_gen_addi_ptr(a4, cpu_env, eofs);
333
334 fn(a0, a1, a2, a3, a4, ptr, desc);
335
336 tcg_temp_free_ptr(a0);
337 tcg_temp_free_ptr(a1);
338 tcg_temp_free_ptr(a2);
339 tcg_temp_free_ptr(a3);
340 tcg_temp_free_ptr(a4);
341 tcg_temp_free_i32(desc);
342 }
343
344 /* Return true if we want to implement something of OPRSZ bytes
345 in units of LNSZ. This limits the expansion of inline code. */
346 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
347 {
348 uint32_t q, r;
349
350 if (oprsz < lnsz) {
351 return false;
352 }
353
354 q = oprsz / lnsz;
355 r = oprsz % lnsz;
356 tcg_debug_assert((r & 7) == 0);
357
358 if (lnsz < 16) {
359 /* For sizes below 16, accept no remainder. */
360 if (r != 0) {
361 return false;
362 }
363 } else {
364 /*
365 * Recall that ARM SVE allows vector sizes that are not a
366 * power of 2, but always a multiple of 16. The intent is
367 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
368 * In addition, expand_clr needs to handle a multiple of 8.
369 * Thus we can handle the tail with one more operation per
370 * diminishing power of 2.
371 */
372 q += ctpop32(r);
373 }
374
375 return q <= MAX_UNROLL;
376 }
377
/* Forward declaration: do_dup_store() and do_dup() call this before its definition. */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
379
380 /* Duplicate C as per VECE. */
381 uint64_t (dup_const)(unsigned vece, uint64_t c)
382 {
383 switch (vece) {
384 case MO_8:
385 return 0x0101010101010101ull * (uint8_t)c;
386 case MO_16:
387 return 0x0001000100010001ull * (uint16_t)c;
388 case MO_32:
389 return 0x0000000100000001ull * (uint32_t)c;
390 case MO_64:
391 return c;
392 default:
393 g_assert_not_reached();
394 }
395 }
396
397 /* Duplicate IN into OUT as per VECE. */
398 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
399 {
400 switch (vece) {
401 case MO_8:
402 tcg_gen_ext8u_i32(out, in);
403 tcg_gen_muli_i32(out, out, 0x01010101);
404 break;
405 case MO_16:
406 tcg_gen_deposit_i32(out, in, in, 16, 16);
407 break;
408 case MO_32:
409 tcg_gen_mov_i32(out, in);
410 break;
411 default:
412 g_assert_not_reached();
413 }
414 }
415
416 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
417 {
418 switch (vece) {
419 case MO_8:
420 tcg_gen_ext8u_i64(out, in);
421 tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
422 break;
423 case MO_16:
424 tcg_gen_ext16u_i64(out, in);
425 tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
426 break;
427 case MO_32:
428 tcg_gen_deposit_i64(out, in, in, 32, 32);
429 break;
430 case MO_64:
431 tcg_gen_mov_i64(out, in);
432 break;
433 default:
434 g_assert_not_reached();
435 }
436 }
437
438 /* Select a supported vector type for implementing an operation on SIZE
439 * bytes. If OP is 0, assume that the real operation to be performed is
440 * required by all backends. Otherwise, make sure than OP can be performed
441 * on elements of size VECE in the selected type. Do not select V64 if
442 * PREFER_I64 is true. Return 0 if no vector type is selected.
443 */
444 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
445 uint32_t size, bool prefer_i64)
446 {
447 /*
448 * Recall that ARM SVE allows vector sizes that are not a
449 * power of 2, but always a multiple of 16. The intent is
450 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
451 * It is hard to imagine a case in which v256 is supported
452 * but v128 is not, but check anyway.
453 * In addition, expand_clr needs to handle a multiple of 8.
454 */
455 if (TCG_TARGET_HAS_v256 &&
456 check_size_impl(size, 32) &&
457 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
458 (!(size & 16) ||
459 (TCG_TARGET_HAS_v128 &&
460 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
461 (!(size & 8) ||
462 (TCG_TARGET_HAS_v64 &&
463 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
464 return TCG_TYPE_V256;
465 }
466 if (TCG_TARGET_HAS_v128 &&
467 check_size_impl(size, 16) &&
468 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
469 (!(size & 8) ||
470 (TCG_TARGET_HAS_v64 &&
471 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
472 return TCG_TYPE_V128;
473 }
474 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
475 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
476 return TCG_TYPE_V64;
477 }
478 return 0;
479 }
480
481 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
482 uint32_t maxsz, TCGv_vec t_vec)
483 {
484 uint32_t i = 0;
485
486 tcg_debug_assert(oprsz >= 8);
487
488 /*
489 * This may be expand_clr for the tail of an operation, e.g.
490 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store
491 * are misaligned wrt the maximum vector size, so do that first.
492 */
493 if (dofs & 8) {
494 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
495 i += 8;
496 }
497
498 switch (type) {
499 case TCG_TYPE_V256:
500 /*
501 * Recall that ARM SVE allows vector sizes that are not a
502 * power of 2, but always a multiple of 16. The intent is
503 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
504 */
505 for (; i + 32 <= oprsz; i += 32) {
506 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
507 }
508 /* fallthru */
509 case TCG_TYPE_V128:
510 for (; i + 16 <= oprsz; i += 16) {
511 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
512 }
513 break;
514 case TCG_TYPE_V64:
515 for (; i < oprsz; i += 8) {
516 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
517 }
518 break;
519 default:
520 g_assert_not_reached();
521 }
522
523 if (oprsz < maxsz) {
524 expand_clr(dofs + oprsz, maxsz - oprsz);
525 }
526 }
527
528 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
529 * Only one of IN_32 or IN_64 may be set;
530 * IN_C is used if IN_32 and IN_64 are unset.
531 */
532 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
533 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
534 uint64_t in_c)
535 {
536 TCGType type;
537 TCGv_i64 t_64;
538 TCGv_i32 t_32, t_desc;
539 TCGv_ptr t_ptr;
540 uint32_t i;
541
542 assert(vece <= (in_32 ? MO_32 : MO_64));
543 assert(in_32 == NULL || in_64 == NULL);
544
545 /* If we're storing 0, expand oprsz to maxsz. */
546 if (in_32 == NULL && in_64 == NULL) {
547 in_c = dup_const(vece, in_c);
548 if (in_c == 0) {
549 oprsz = maxsz;
550 }
551 }
552
553 /* Implement inline with a vector type, if possible.
554 * Prefer integer when 64-bit host and no variable dup.
555 */
556 type = choose_vector_type(NULL, vece, oprsz,
557 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
558 && (in_64 == NULL || vece == MO_64)));
559 if (type != 0) {
560 TCGv_vec t_vec = tcg_temp_new_vec(type);
561
562 if (in_32) {
563 tcg_gen_dup_i32_vec(vece, t_vec, in_32);
564 } else if (in_64) {
565 tcg_gen_dup_i64_vec(vece, t_vec, in_64);
566 } else {
567 tcg_gen_dupi_vec(vece, t_vec, in_c);
568 }
569 do_dup_store(type, dofs, oprsz, maxsz, t_vec);
570 tcg_temp_free_vec(t_vec);
571 return;
572 }
573
574 /* Otherwise, inline with an integer type, unless "large". */
575 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
576 t_64 = NULL;
577 t_32 = NULL;
578
579 if (in_32) {
580 /* We are given a 32-bit variable input. For a 64-bit host,
581 use a 64-bit operation unless the 32-bit operation would
582 be simple enough. */
583 if (TCG_TARGET_REG_BITS == 64
584 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
585 t_64 = tcg_temp_new_i64();
586 tcg_gen_extu_i32_i64(t_64, in_32);
587 gen_dup_i64(vece, t_64, t_64);
588 } else {
589 t_32 = tcg_temp_new_i32();
590 gen_dup_i32(vece, t_32, in_32);
591 }
592 } else if (in_64) {
593 /* We are given a 64-bit variable input. */
594 t_64 = tcg_temp_new_i64();
595 gen_dup_i64(vece, t_64, in_64);
596 } else {
597 /* We are given a constant input. */
598 /* For 64-bit hosts, use 64-bit constants for "simple" constants
599 or when we'd need too many 32-bit stores, or when a 64-bit
600 constant is really required. */
601 if (vece == MO_64
602 || (TCG_TARGET_REG_BITS == 64
603 && (in_c == 0 || in_c == -1
604 || !check_size_impl(oprsz, 4)))) {
605 t_64 = tcg_const_i64(in_c);
606 } else {
607 t_32 = tcg_const_i32(in_c);
608 }
609 }
610
611 /* Implement inline if we picked an implementation size above. */
612 if (t_32) {
613 for (i = 0; i < oprsz; i += 4) {
614 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
615 }
616 tcg_temp_free_i32(t_32);
617 goto done;
618 }
619 if (t_64) {
620 for (i = 0; i < oprsz; i += 8) {
621 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
622 }
623 tcg_temp_free_i64(t_64);
624 goto done;
625 }
626 }
627
628 /* Otherwise implement out of line. */
629 t_ptr = tcg_temp_new_ptr();
630 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
631 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
632
633 if (vece == MO_64) {
634 if (in_64) {
635 gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
636 } else {
637 t_64 = tcg_const_i64(in_c);
638 gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
639 tcg_temp_free_i64(t_64);
640 }
641 } else {
642 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
643 static dup_fn * const fns[3] = {
644 gen_helper_gvec_dup8,
645 gen_helper_gvec_dup16,
646 gen_helper_gvec_dup32
647 };
648
649 if (in_32) {
650 fns[vece](t_ptr, t_desc, in_32);
651 } else {
652 t_32 = tcg_temp_new_i32();
653 if (in_64) {
654 tcg_gen_extrl_i64_i32(t_32, in_64);
655 } else if (vece == MO_8) {
656 tcg_gen_movi_i32(t_32, in_c & 0xff);
657 } else if (vece == MO_16) {
658 tcg_gen_movi_i32(t_32, in_c & 0xffff);
659 } else {
660 tcg_gen_movi_i32(t_32, in_c);
661 }
662 fns[vece](t_ptr, t_desc, t_32);
663 tcg_temp_free_i32(t_32);
664 }
665 }
666
667 tcg_temp_free_ptr(t_ptr);
668 tcg_temp_free_i32(t_desc);
669 return;
670
671 done:
672 if (oprsz < maxsz) {
673 expand_clr(dofs + oprsz, maxsz - oprsz);
674 }
675 }
676
677 /* Likewise, but with zero. */
678 static void expand_clr(uint32_t dofs, uint32_t maxsz)
679 {
680 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
681 }
682
683 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
684 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
685 bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
686 {
687 TCGv_i32 t0 = tcg_temp_new_i32();
688 TCGv_i32 t1 = tcg_temp_new_i32();
689 uint32_t i;
690
691 for (i = 0; i < oprsz; i += 4) {
692 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
693 if (load_dest) {
694 tcg_gen_ld_i32(t1, cpu_env, dofs + i);
695 }
696 fni(t1, t0);
697 tcg_gen_st_i32(t1, cpu_env, dofs + i);
698 }
699 tcg_temp_free_i32(t0);
700 tcg_temp_free_i32(t1);
701 }
702
703 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
704 int32_t c, bool load_dest,
705 void (*fni)(TCGv_i32, TCGv_i32, int32_t))
706 {
707 TCGv_i32 t0 = tcg_temp_new_i32();
708 TCGv_i32 t1 = tcg_temp_new_i32();
709 uint32_t i;
710
711 for (i = 0; i < oprsz; i += 4) {
712 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
713 if (load_dest) {
714 tcg_gen_ld_i32(t1, cpu_env, dofs + i);
715 }
716 fni(t1, t0, c);
717 tcg_gen_st_i32(t1, cpu_env, dofs + i);
718 }
719 tcg_temp_free_i32(t0);
720 tcg_temp_free_i32(t1);
721 }
722
723 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
724 TCGv_i32 c, bool scalar_first,
725 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
726 {
727 TCGv_i32 t0 = tcg_temp_new_i32();
728 TCGv_i32 t1 = tcg_temp_new_i32();
729 uint32_t i;
730
731 for (i = 0; i < oprsz; i += 4) {
732 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
733 if (scalar_first) {
734 fni(t1, c, t0);
735 } else {
736 fni(t1, t0, c);
737 }
738 tcg_gen_st_i32(t1, cpu_env, dofs + i);
739 }
740 tcg_temp_free_i32(t0);
741 tcg_temp_free_i32(t1);
742 }
743
744 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
745 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
746 uint32_t bofs, uint32_t oprsz, bool load_dest,
747 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
748 {
749 TCGv_i32 t0 = tcg_temp_new_i32();
750 TCGv_i32 t1 = tcg_temp_new_i32();
751 TCGv_i32 t2 = tcg_temp_new_i32();
752 uint32_t i;
753
754 for (i = 0; i < oprsz; i += 4) {
755 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
756 tcg_gen_ld_i32(t1, cpu_env, bofs + i);
757 if (load_dest) {
758 tcg_gen_ld_i32(t2, cpu_env, dofs + i);
759 }
760 fni(t2, t0, t1);
761 tcg_gen_st_i32(t2, cpu_env, dofs + i);
762 }
763 tcg_temp_free_i32(t2);
764 tcg_temp_free_i32(t1);
765 tcg_temp_free_i32(t0);
766 }
767
768 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
769 uint32_t oprsz, int32_t c, bool load_dest,
770 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
771 {
772 TCGv_i32 t0 = tcg_temp_new_i32();
773 TCGv_i32 t1 = tcg_temp_new_i32();
774 TCGv_i32 t2 = tcg_temp_new_i32();
775 uint32_t i;
776
777 for (i = 0; i < oprsz; i += 4) {
778 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
779 tcg_gen_ld_i32(t1, cpu_env, bofs + i);
780 if (load_dest) {
781 tcg_gen_ld_i32(t2, cpu_env, dofs + i);
782 }
783 fni(t2, t0, t1, c);
784 tcg_gen_st_i32(t2, cpu_env, dofs + i);
785 }
786 tcg_temp_free_i32(t0);
787 tcg_temp_free_i32(t1);
788 tcg_temp_free_i32(t2);
789 }
790
791 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
792 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
793 uint32_t cofs, uint32_t oprsz, bool write_aofs,
794 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
795 {
796 TCGv_i32 t0 = tcg_temp_new_i32();
797 TCGv_i32 t1 = tcg_temp_new_i32();
798 TCGv_i32 t2 = tcg_temp_new_i32();
799 TCGv_i32 t3 = tcg_temp_new_i32();
800 uint32_t i;
801
802 for (i = 0; i < oprsz; i += 4) {
803 tcg_gen_ld_i32(t1, cpu_env, aofs + i);
804 tcg_gen_ld_i32(t2, cpu_env, bofs + i);
805 tcg_gen_ld_i32(t3, cpu_env, cofs + i);
806 fni(t0, t1, t2, t3);
807 tcg_gen_st_i32(t0, cpu_env, dofs + i);
808 if (write_aofs) {
809 tcg_gen_st_i32(t1, cpu_env, aofs + i);
810 }
811 }
812 tcg_temp_free_i32(t3);
813 tcg_temp_free_i32(t2);
814 tcg_temp_free_i32(t1);
815 tcg_temp_free_i32(t0);
816 }
817
818 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
819 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
820 bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
821 {
822 TCGv_i64 t0 = tcg_temp_new_i64();
823 TCGv_i64 t1 = tcg_temp_new_i64();
824 uint32_t i;
825
826 for (i = 0; i < oprsz; i += 8) {
827 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
828 if (load_dest) {
829 tcg_gen_ld_i64(t1, cpu_env, dofs + i);
830 }
831 fni(t1, t0);
832 tcg_gen_st_i64(t1, cpu_env, dofs + i);
833 }
834 tcg_temp_free_i64(t0);
835 tcg_temp_free_i64(t1);
836 }
837
838 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
839 int64_t c, bool load_dest,
840 void (*fni)(TCGv_i64, TCGv_i64, int64_t))
841 {
842 TCGv_i64 t0 = tcg_temp_new_i64();
843 TCGv_i64 t1 = tcg_temp_new_i64();
844 uint32_t i;
845
846 for (i = 0; i < oprsz; i += 8) {
847 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
848 if (load_dest) {
849 tcg_gen_ld_i64(t1, cpu_env, dofs + i);
850 }
851 fni(t1, t0, c);
852 tcg_gen_st_i64(t1, cpu_env, dofs + i);
853 }
854 tcg_temp_free_i64(t0);
855 tcg_temp_free_i64(t1);
856 }
857
858 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
859 TCGv_i64 c, bool scalar_first,
860 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
861 {
862 TCGv_i64 t0 = tcg_temp_new_i64();
863 TCGv_i64 t1 = tcg_temp_new_i64();
864 uint32_t i;
865
866 for (i = 0; i < oprsz; i += 8) {
867 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
868 if (scalar_first) {
869 fni(t1, c, t0);
870 } else {
871 fni(t1, t0, c);
872 }
873 tcg_gen_st_i64(t1, cpu_env, dofs + i);
874 }
875 tcg_temp_free_i64(t0);
876 tcg_temp_free_i64(t1);
877 }
878
879 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
880 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
881 uint32_t bofs, uint32_t oprsz, bool load_dest,
882 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
883 {
884 TCGv_i64 t0 = tcg_temp_new_i64();
885 TCGv_i64 t1 = tcg_temp_new_i64();
886 TCGv_i64 t2 = tcg_temp_new_i64();
887 uint32_t i;
888
889 for (i = 0; i < oprsz; i += 8) {
890 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
891 tcg_gen_ld_i64(t1, cpu_env, bofs + i);
892 if (load_dest) {
893 tcg_gen_ld_i64(t2, cpu_env, dofs + i);
894 }
895 fni(t2, t0, t1);
896 tcg_gen_st_i64(t2, cpu_env, dofs + i);
897 }
898 tcg_temp_free_i64(t2);
899 tcg_temp_free_i64(t1);
900 tcg_temp_free_i64(t0);
901 }
902
903 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
904 uint32_t oprsz, int64_t c, bool load_dest,
905 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
906 {
907 TCGv_i64 t0 = tcg_temp_new_i64();
908 TCGv_i64 t1 = tcg_temp_new_i64();
909 TCGv_i64 t2 = tcg_temp_new_i64();
910 uint32_t i;
911
912 for (i = 0; i < oprsz; i += 8) {
913 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
914 tcg_gen_ld_i64(t1, cpu_env, bofs + i);
915 if (load_dest) {
916 tcg_gen_ld_i64(t2, cpu_env, dofs + i);
917 }
918 fni(t2, t0, t1, c);
919 tcg_gen_st_i64(t2, cpu_env, dofs + i);
920 }
921 tcg_temp_free_i64(t0);
922 tcg_temp_free_i64(t1);
923 tcg_temp_free_i64(t2);
924 }
925
926 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
927 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
928 uint32_t cofs, uint32_t oprsz, bool write_aofs,
929 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
930 {
931 TCGv_i64 t0 = tcg_temp_new_i64();
932 TCGv_i64 t1 = tcg_temp_new_i64();
933 TCGv_i64 t2 = tcg_temp_new_i64();
934 TCGv_i64 t3 = tcg_temp_new_i64();
935 uint32_t i;
936
937 for (i = 0; i < oprsz; i += 8) {
938 tcg_gen_ld_i64(t1, cpu_env, aofs + i);
939 tcg_gen_ld_i64(t2, cpu_env, bofs + i);
940 tcg_gen_ld_i64(t3, cpu_env, cofs + i);
941 fni(t0, t1, t2, t3);
942 tcg_gen_st_i64(t0, cpu_env, dofs + i);
943 if (write_aofs) {
944 tcg_gen_st_i64(t1, cpu_env, aofs + i);
945 }
946 }
947 tcg_temp_free_i64(t3);
948 tcg_temp_free_i64(t2);
949 tcg_temp_free_i64(t1);
950 tcg_temp_free_i64(t0);
951 }
952
953 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */
954 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
955 uint32_t oprsz, uint32_t tysz, TCGType type,
956 bool load_dest,
957 void (*fni)(unsigned, TCGv_vec, TCGv_vec))
958 {
959 TCGv_vec t0 = tcg_temp_new_vec(type);
960 TCGv_vec t1 = tcg_temp_new_vec(type);
961 uint32_t i;
962
963 for (i = 0; i < oprsz; i += tysz) {
964 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
965 if (load_dest) {
966 tcg_gen_ld_vec(t1, cpu_env, dofs + i);
967 }
968 fni(vece, t1, t0);
969 tcg_gen_st_vec(t1, cpu_env, dofs + i);
970 }
971 tcg_temp_free_vec(t0);
972 tcg_temp_free_vec(t1);
973 }
974
975 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
976 using host vectors. */
977 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
978 uint32_t oprsz, uint32_t tysz, TCGType type,
979 int64_t c, bool load_dest,
980 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
981 {
982 TCGv_vec t0 = tcg_temp_new_vec(type);
983 TCGv_vec t1 = tcg_temp_new_vec(type);
984 uint32_t i;
985
986 for (i = 0; i < oprsz; i += tysz) {
987 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
988 if (load_dest) {
989 tcg_gen_ld_vec(t1, cpu_env, dofs + i);
990 }
991 fni(vece, t1, t0, c);
992 tcg_gen_st_vec(t1, cpu_env, dofs + i);
993 }
994 tcg_temp_free_vec(t0);
995 tcg_temp_free_vec(t1);
996 }
997
998 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
999 uint32_t oprsz, uint32_t tysz, TCGType type,
1000 TCGv_vec c, bool scalar_first,
1001 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1002 {
1003 TCGv_vec t0 = tcg_temp_new_vec(type);
1004 TCGv_vec t1 = tcg_temp_new_vec(type);
1005 uint32_t i;
1006
1007 for (i = 0; i < oprsz; i += tysz) {
1008 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009 if (scalar_first) {
1010 fni(vece, t1, c, t0);
1011 } else {
1012 fni(vece, t1, t0, c);
1013 }
1014 tcg_gen_st_vec(t1, cpu_env, dofs + i);
1015 }
1016 tcg_temp_free_vec(t0);
1017 tcg_temp_free_vec(t1);
1018 }
1019
1020 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */
1021 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1022 uint32_t bofs, uint32_t oprsz,
1023 uint32_t tysz, TCGType type, bool load_dest,
1024 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1025 {
1026 TCGv_vec t0 = tcg_temp_new_vec(type);
1027 TCGv_vec t1 = tcg_temp_new_vec(type);
1028 TCGv_vec t2 = tcg_temp_new_vec(type);
1029 uint32_t i;
1030
1031 for (i = 0; i < oprsz; i += tysz) {
1032 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1033 tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1034 if (load_dest) {
1035 tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1036 }
1037 fni(vece, t2, t0, t1);
1038 tcg_gen_st_vec(t2, cpu_env, dofs + i);
1039 }
1040 tcg_temp_free_vec(t2);
1041 tcg_temp_free_vec(t1);
1042 tcg_temp_free_vec(t0);
1043 }
1044
1045 /*
1046 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1047 * using host vectors.
1048 */
1049 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1050 uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1051 TCGType type, int64_t c, bool load_dest,
1052 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1053 int64_t))
1054 {
1055 TCGv_vec t0 = tcg_temp_new_vec(type);
1056 TCGv_vec t1 = tcg_temp_new_vec(type);
1057 TCGv_vec t2 = tcg_temp_new_vec(type);
1058 uint32_t i;
1059
1060 for (i = 0; i < oprsz; i += tysz) {
1061 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1062 tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1063 if (load_dest) {
1064 tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1065 }
1066 fni(vece, t2, t0, t1, c);
1067 tcg_gen_st_vec(t2, cpu_env, dofs + i);
1068 }
1069 tcg_temp_free_vec(t0);
1070 tcg_temp_free_vec(t1);
1071 tcg_temp_free_vec(t2);
1072 }
1073
1074 /* Expand OPSZ bytes worth of four-operand operations using host vectors. */
1075 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1076 uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1077 uint32_t tysz, TCGType type, bool write_aofs,
1078 void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1079 TCGv_vec, TCGv_vec))
1080 {
1081 TCGv_vec t0 = tcg_temp_new_vec(type);
1082 TCGv_vec t1 = tcg_temp_new_vec(type);
1083 TCGv_vec t2 = tcg_temp_new_vec(type);
1084 TCGv_vec t3 = tcg_temp_new_vec(type);
1085 uint32_t i;
1086
1087 for (i = 0; i < oprsz; i += tysz) {
1088 tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1089 tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1090 tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1091 fni(vece, t0, t1, t2, t3);
1092 tcg_gen_st_vec(t0, cpu_env, dofs + i);
1093 if (write_aofs) {
1094 tcg_gen_st_vec(t1, cpu_env, aofs + i);
1095 }
1096 }
1097 tcg_temp_free_vec(t3);
1098 tcg_temp_free_vec(t2);
1099 tcg_temp_free_vec(t1);
1100 tcg_temp_free_vec(t0);
1101 }
1102
1103 /* Expand a vector two-operand operation. */
1104 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1105 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1106 {
1107 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1108 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1109 TCGType type;
1110 uint32_t some;
1111
1112 check_size_align(oprsz, maxsz, dofs | aofs);
1113 check_overlap_2(dofs, aofs, maxsz);
1114
1115 type = 0;
1116 if (g->fniv) {
1117 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1118 }
1119 switch (type) {
1120 case TCG_TYPE_V256:
1121 /* Recall that ARM SVE allows vector sizes that are not a
1122 * power of 2, but always a multiple of 16. The intent is
1123 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1124 */
1125 some = QEMU_ALIGN_DOWN(oprsz, 32);
1126 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1127 g->load_dest, g->fniv);
1128 if (some == oprsz) {
1129 break;
1130 }
1131 dofs += some;
1132 aofs += some;
1133 oprsz -= some;
1134 maxsz -= some;
1135 /* fallthru */
1136 case TCG_TYPE_V128:
1137 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1138 g->load_dest, g->fniv);
1139 break;
1140 case TCG_TYPE_V64:
1141 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1142 g->load_dest, g->fniv);
1143 break;
1144
1145 case 0:
1146 if (g->fni8 && check_size_impl(oprsz, 8)) {
1147 expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1148 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1149 expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1150 } else {
1151 assert(g->fno != NULL);
1152 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1153 oprsz = maxsz;
1154 }
1155 break;
1156
1157 default:
1158 g_assert_not_reached();
1159 }
1160 tcg_swap_vecop_list(hold_list);
1161
1162 if (oprsz < maxsz) {
1163 expand_clr(dofs + oprsz, maxsz - oprsz);
1164 }
1165 }
1166
1167 /* Expand a vector operation with two vectors and an immediate. */
1168 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1169 uint32_t maxsz, int64_t c, const GVecGen2i *g)
1170 {
1171 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1172 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1173 TCGType type;
1174 uint32_t some;
1175
1176 check_size_align(oprsz, maxsz, dofs | aofs);
1177 check_overlap_2(dofs, aofs, maxsz);
1178
1179 type = 0;
1180 if (g->fniv) {
1181 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1182 }
1183 switch (type) {
1184 case TCG_TYPE_V256:
1185 /* Recall that ARM SVE allows vector sizes that are not a
1186 * power of 2, but always a multiple of 16. The intent is
1187 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1188 */
1189 some = QEMU_ALIGN_DOWN(oprsz, 32);
1190 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1191 c, g->load_dest, g->fniv);
1192 if (some == oprsz) {
1193 break;
1194 }
1195 dofs += some;
1196 aofs += some;
1197 oprsz -= some;
1198 maxsz -= some;
1199 /* fallthru */
1200 case TCG_TYPE_V128:
1201 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1202 c, g->load_dest, g->fniv);
1203 break;
1204 case TCG_TYPE_V64:
1205 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1206 c, g->load_dest, g->fniv);
1207 break;
1208
1209 case 0:
1210 if (g->fni8 && check_size_impl(oprsz, 8)) {
1211 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1212 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1213 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1214 } else {
1215 if (g->fno) {
1216 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1217 } else {
1218 TCGv_i64 tcg_c = tcg_const_i64(c);
1219 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1220 maxsz, c, g->fnoi);
1221 tcg_temp_free_i64(tcg_c);
1222 }
1223 oprsz = maxsz;
1224 }
1225 break;
1226
1227 default:
1228 g_assert_not_reached();
1229 }
1230 tcg_swap_vecop_list(hold_list);
1231
1232 if (oprsz < maxsz) {
1233 expand_clr(dofs + oprsz, maxsz - oprsz);
1234 }
1235 }
1236
1237 /* Expand a vector operation with two vectors and a scalar. */
1238 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1239 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1240 {
1241 TCGType type;
1242
1243 check_size_align(oprsz, maxsz, dofs | aofs);
1244 check_overlap_2(dofs, aofs, maxsz);
1245
1246 type = 0;
1247 if (g->fniv) {
1248 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1249 }
1250 if (type != 0) {
1251 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1252 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1253 TCGv_vec t_vec = tcg_temp_new_vec(type);
1254 uint32_t some;
1255
1256 tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1257
1258 switch (type) {
1259 case TCG_TYPE_V256:
1260 /* Recall that ARM SVE allows vector sizes that are not a
1261 * power of 2, but always a multiple of 16. The intent is
1262 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1263 */
1264 some = QEMU_ALIGN_DOWN(oprsz, 32);
1265 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1266 t_vec, g->scalar_first, g->fniv);
1267 if (some == oprsz) {
1268 break;
1269 }
1270 dofs += some;
1271 aofs += some;
1272 oprsz -= some;
1273 maxsz -= some;
1274 /* fallthru */
1275
1276 case TCG_TYPE_V128:
1277 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1278 t_vec, g->scalar_first, g->fniv);
1279 break;
1280
1281 case TCG_TYPE_V64:
1282 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1283 t_vec, g->scalar_first, g->fniv);
1284 break;
1285
1286 default:
1287 g_assert_not_reached();
1288 }
1289 tcg_temp_free_vec(t_vec);
1290 tcg_swap_vecop_list(hold_list);
1291 } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1292 TCGv_i64 t64 = tcg_temp_new_i64();
1293
1294 gen_dup_i64(g->vece, t64, c);
1295 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1296 tcg_temp_free_i64(t64);
1297 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1298 TCGv_i32 t32 = tcg_temp_new_i32();
1299
1300 tcg_gen_extrl_i64_i32(t32, c);
1301 gen_dup_i32(g->vece, t32, t32);
1302 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1303 tcg_temp_free_i32(t32);
1304 } else {
1305 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1306 return;
1307 }
1308
1309 if (oprsz < maxsz) {
1310 expand_clr(dofs + oprsz, maxsz - oprsz);
1311 }
1312 }
1313
1314 /* Expand a vector three-operand operation. */
1315 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1316 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1317 {
1318 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1319 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1320 TCGType type;
1321 uint32_t some;
1322
1323 check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1324 check_overlap_3(dofs, aofs, bofs, maxsz);
1325
1326 type = 0;
1327 if (g->fniv) {
1328 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1329 }
1330 switch (type) {
1331 case TCG_TYPE_V256:
1332 /* Recall that ARM SVE allows vector sizes that are not a
1333 * power of 2, but always a multiple of 16. The intent is
1334 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1335 */
1336 some = QEMU_ALIGN_DOWN(oprsz, 32);
1337 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1338 g->load_dest, g->fniv);
1339 if (some == oprsz) {
1340 break;
1341 }
1342 dofs += some;
1343 aofs += some;
1344 bofs += some;
1345 oprsz -= some;
1346 maxsz -= some;
1347 /* fallthru */
1348 case TCG_TYPE_V128:
1349 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1350 g->load_dest, g->fniv);
1351 break;
1352 case TCG_TYPE_V64:
1353 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1354 g->load_dest, g->fniv);
1355 break;
1356
1357 case 0:
1358 if (g->fni8 && check_size_impl(oprsz, 8)) {
1359 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1360 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1361 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1362 } else {
1363 assert(g->fno != NULL);
1364 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1365 maxsz, g->data, g->fno);
1366 oprsz = maxsz;
1367 }
1368 break;
1369
1370 default:
1371 g_assert_not_reached();
1372 }
1373 tcg_swap_vecop_list(hold_list);
1374
1375 if (oprsz < maxsz) {
1376 expand_clr(dofs + oprsz, maxsz - oprsz);
1377 }
1378 }
1379
1380 /* Expand a vector operation with three vectors and an immediate. */
1381 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1382 uint32_t oprsz, uint32_t maxsz, int64_t c,
1383 const GVecGen3i *g)
1384 {
1385 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1386 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1387 TCGType type;
1388 uint32_t some;
1389
1390 check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1391 check_overlap_3(dofs, aofs, bofs, maxsz);
1392
1393 type = 0;
1394 if (g->fniv) {
1395 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1396 }
1397 switch (type) {
1398 case TCG_TYPE_V256:
1399 /*
1400 * Recall that ARM SVE allows vector sizes that are not a
1401 * power of 2, but always a multiple of 16. The intent is
1402 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1403 */
1404 some = QEMU_ALIGN_DOWN(oprsz, 32);
1405 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1406 c, g->load_dest, g->fniv);
1407 if (some == oprsz) {
1408 break;
1409 }
1410 dofs += some;
1411 aofs += some;
1412 bofs += some;
1413 oprsz -= some;
1414 maxsz -= some;
1415 /* fallthru */
1416 case TCG_TYPE_V128:
1417 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1418 c, g->load_dest, g->fniv);
1419 break;
1420 case TCG_TYPE_V64:
1421 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1422 c, g->load_dest, g->fniv);
1423 break;
1424
1425 case 0:
1426 if (g->fni8 && check_size_impl(oprsz, 8)) {
1427 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1428 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1429 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1430 } else {
1431 assert(g->fno != NULL);
1432 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1433 oprsz = maxsz;
1434 }
1435 break;
1436
1437 default:
1438 g_assert_not_reached();
1439 }
1440 tcg_swap_vecop_list(hold_list);
1441
1442 if (oprsz < maxsz) {
1443 expand_clr(dofs + oprsz, maxsz - oprsz);
1444 }
1445 }
1446
1447 /* Expand a vector four-operand operation. */
1448 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1449 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1450 {
1451 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1452 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1453 TCGType type;
1454 uint32_t some;
1455
1456 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1457 check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1458
1459 type = 0;
1460 if (g->fniv) {
1461 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1462 }
1463 switch (type) {
1464 case TCG_TYPE_V256:
1465 /* Recall that ARM SVE allows vector sizes that are not a
1466 * power of 2, but always a multiple of 16. The intent is
1467 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1468 */
1469 some = QEMU_ALIGN_DOWN(oprsz, 32);
1470 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1471 32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1472 if (some == oprsz) {
1473 break;
1474 }
1475 dofs += some;
1476 aofs += some;
1477 bofs += some;
1478 cofs += some;
1479 oprsz -= some;
1480 maxsz -= some;
1481 /* fallthru */
1482 case TCG_TYPE_V128:
1483 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1484 16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1485 break;
1486 case TCG_TYPE_V64:
1487 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1488 8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1489 break;
1490
1491 case 0:
1492 if (g->fni8 && check_size_impl(oprsz, 8)) {
1493 expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1494 g->write_aofs, g->fni8);
1495 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1496 expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1497 g->write_aofs, g->fni4);
1498 } else {
1499 assert(g->fno != NULL);
1500 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1501 oprsz, maxsz, g->data, g->fno);
1502 oprsz = maxsz;
1503 }
1504 break;
1505
1506 default:
1507 g_assert_not_reached();
1508 }
1509 tcg_swap_vecop_list(hold_list);
1510
1511 if (oprsz < maxsz) {
1512 expand_clr(dofs + oprsz, maxsz - oprsz);
1513 }
1514 }
1515
1516 /*
1517 * Expand specific vector operations.
1518 */
1519
1520 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1521 {
1522 tcg_gen_mov_vec(a, b);
1523 }
1524
1525 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1526 uint32_t oprsz, uint32_t maxsz)
1527 {
1528 static const GVecGen2 g = {
1529 .fni8 = tcg_gen_mov_i64,
1530 .fniv = vec_mov2,
1531 .fno = gen_helper_gvec_mov,
1532 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1533 };
1534 if (dofs != aofs) {
1535 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1536 } else {
1537 check_size_align(oprsz, maxsz, dofs);
1538 if (oprsz < maxsz) {
1539 expand_clr(dofs + oprsz, maxsz - oprsz);
1540 }
1541 }
1542 }
1543
1544 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1545 uint32_t maxsz, TCGv_i32 in)
1546 {
1547 check_size_align(oprsz, maxsz, dofs);
1548 tcg_debug_assert(vece <= MO_32);
1549 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1550 }
1551
1552 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1553 uint32_t maxsz, TCGv_i64 in)
1554 {
1555 check_size_align(oprsz, maxsz, dofs);
1556 tcg_debug_assert(vece <= MO_64);
1557 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1558 }
1559
1560 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1561 uint32_t oprsz, uint32_t maxsz)
1562 {
1563 check_size_align(oprsz, maxsz, dofs);
1564 if (vece <= MO_64) {
1565 TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1566 if (type != 0) {
1567 TCGv_vec t_vec = tcg_temp_new_vec(type);
1568 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1569 do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1570 tcg_temp_free_vec(t_vec);
1571 } else if (vece <= MO_32) {
1572 TCGv_i32 in = tcg_temp_new_i32();
1573 switch (vece) {
1574 case MO_8:
1575 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1576 break;
1577 case MO_16:
1578 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1579 break;
1580 default:
1581 tcg_gen_ld_i32(in, cpu_env, aofs);
1582 break;
1583 }
1584 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1585 tcg_temp_free_i32(in);
1586 } else {
1587 TCGv_i64 in = tcg_temp_new_i64();
1588 tcg_gen_ld_i64(in, cpu_env, aofs);
1589 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1590 tcg_temp_free_i64(in);
1591 }
1592 } else if (vece == 4) {
1593 /* 128-bit duplicate. */
1594 int i;
1595
1596 tcg_debug_assert(oprsz >= 16);
1597 if (TCG_TARGET_HAS_v128) {
1598 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1599
1600 tcg_gen_ld_vec(in, cpu_env, aofs);
1601 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1602 tcg_gen_st_vec(in, cpu_env, dofs + i);
1603 }
1604 tcg_temp_free_vec(in);
1605 } else {
1606 TCGv_i64 in0 = tcg_temp_new_i64();
1607 TCGv_i64 in1 = tcg_temp_new_i64();
1608
1609 tcg_gen_ld_i64(in0, cpu_env, aofs);
1610 tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1611 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1612 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1613 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1614 }
1615 tcg_temp_free_i64(in0);
1616 tcg_temp_free_i64(in1);
1617 }
1618 if (oprsz < maxsz) {
1619 expand_clr(dofs + oprsz, maxsz - oprsz);
1620 }
1621 } else if (vece == 5) {
1622 /* 256-bit duplicate. */
1623 int i;
1624
1625 tcg_debug_assert(oprsz >= 32);
1626 tcg_debug_assert(oprsz % 32 == 0);
1627 if (TCG_TARGET_HAS_v256) {
1628 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1629
1630 tcg_gen_ld_vec(in, cpu_env, aofs);
1631 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1632 tcg_gen_st_vec(in, cpu_env, dofs + i);
1633 }
1634 tcg_temp_free_vec(in);
1635 } else if (TCG_TARGET_HAS_v128) {
1636 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1637 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1638
1639 tcg_gen_ld_vec(in0, cpu_env, aofs);
1640 tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1641 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1642 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1643 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1644 }
1645 tcg_temp_free_vec(in0);
1646 tcg_temp_free_vec(in1);
1647 } else {
1648 TCGv_i64 in[4];
1649 int j;
1650
1651 for (j = 0; j < 4; ++j) {
1652 in[j] = tcg_temp_new_i64();
1653 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1654 }
1655 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1656 for (j = 0; j < 4; ++j) {
1657 tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1658 }
1659 }
1660 for (j = 0; j < 4; ++j) {
1661 tcg_temp_free_i64(in[j]);
1662 }
1663 }
1664 if (oprsz < maxsz) {
1665 expand_clr(dofs + oprsz, maxsz - oprsz);
1666 }
1667 } else {
1668 g_assert_not_reached();
1669 }
1670 }
1671
1672 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1673 uint32_t maxsz, uint64_t x)
1674 {
1675 check_size_align(oprsz, maxsz, dofs);
1676 do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1677 }
1678
1679 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1680 uint32_t oprsz, uint32_t maxsz)
1681 {
1682 static const GVecGen2 g = {
1683 .fni8 = tcg_gen_not_i64,
1684 .fniv = tcg_gen_not_vec,
1685 .fno = gen_helper_gvec_not,
1686 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1687 };
1688 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1689 }
1690
1691 /* Perform a vector addition using normal addition and a mask. The mask
1692 should be the sign bit of each lane. This 6-operation form is more
1693 efficient than separate additions when there are 4 or more lanes in
1694 the 64-bit operation. */
1695 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1696 {
1697 TCGv_i64 t1 = tcg_temp_new_i64();
1698 TCGv_i64 t2 = tcg_temp_new_i64();
1699 TCGv_i64 t3 = tcg_temp_new_i64();
1700
1701 tcg_gen_andc_i64(t1, a, m);
1702 tcg_gen_andc_i64(t2, b, m);
1703 tcg_gen_xor_i64(t3, a, b);
1704 tcg_gen_add_i64(d, t1, t2);
1705 tcg_gen_and_i64(t3, t3, m);
1706 tcg_gen_xor_i64(d, d, t3);
1707
1708 tcg_temp_free_i64(t1);
1709 tcg_temp_free_i64(t2);
1710 tcg_temp_free_i64(t3);
1711 }
1712
1713 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1714 {
1715 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1716 gen_addv_mask(d, a, b, m);
1717 tcg_temp_free_i64(m);
1718 }
1719
1720 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1721 {
1722 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1723 gen_addv_mask(d, a, b, m);
1724 tcg_temp_free_i64(m);
1725 }
1726
1727 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1728 {
1729 TCGv_i64 t1 = tcg_temp_new_i64();
1730 TCGv_i64 t2 = tcg_temp_new_i64();
1731
1732 tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1733 tcg_gen_add_i64(t2, a, b);
1734 tcg_gen_add_i64(t1, t1, b);
1735 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1736
1737 tcg_temp_free_i64(t1);
1738 tcg_temp_free_i64(t2);
1739 }
1740
1741 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1742
1743 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1744 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1745 {
1746 static const GVecGen3 g[4] = {
1747 { .fni8 = tcg_gen_vec_add8_i64,
1748 .fniv = tcg_gen_add_vec,
1749 .fno = gen_helper_gvec_add8,
1750 .opt_opc = vecop_list_add,
1751 .vece = MO_8 },
1752 { .fni8 = tcg_gen_vec_add16_i64,
1753 .fniv = tcg_gen_add_vec,
1754 .fno = gen_helper_gvec_add16,
1755 .opt_opc = vecop_list_add,
1756 .vece = MO_16 },
1757 { .fni4 = tcg_gen_add_i32,
1758 .fniv = tcg_gen_add_vec,
1759 .fno = gen_helper_gvec_add32,
1760 .opt_opc = vecop_list_add,
1761 .vece = MO_32 },
1762 { .fni8 = tcg_gen_add_i64,
1763 .fniv = tcg_gen_add_vec,
1764 .fno = gen_helper_gvec_add64,
1765 .opt_opc = vecop_list_add,
1766 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1767 .vece = MO_64 },
1768 };
1769
1770 tcg_debug_assert(vece <= MO_64);
1771 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1772 }
1773
1774 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1775 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1776 {
1777 static const GVecGen2s g[4] = {
1778 { .fni8 = tcg_gen_vec_add8_i64,
1779 .fniv = tcg_gen_add_vec,
1780 .fno = gen_helper_gvec_adds8,
1781 .opt_opc = vecop_list_add,
1782 .vece = MO_8 },
1783 { .fni8 = tcg_gen_vec_add16_i64,
1784 .fniv = tcg_gen_add_vec,
1785 .fno = gen_helper_gvec_adds16,
1786 .opt_opc = vecop_list_add,
1787 .vece = MO_16 },
1788 { .fni4 = tcg_gen_add_i32,
1789 .fniv = tcg_gen_add_vec,
1790 .fno = gen_helper_gvec_adds32,
1791 .opt_opc = vecop_list_add,
1792 .vece = MO_32 },
1793 { .fni8 = tcg_gen_add_i64,
1794 .fniv = tcg_gen_add_vec,
1795 .fno = gen_helper_gvec_adds64,
1796 .opt_opc = vecop_list_add,
1797 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1798 .vece = MO_64 },
1799 };
1800
1801 tcg_debug_assert(vece <= MO_64);
1802 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1803 }
1804
1805 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1806 int64_t c, uint32_t oprsz, uint32_t maxsz)
1807 {
1808 TCGv_i64 tmp = tcg_const_i64(c);
1809 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1810 tcg_temp_free_i64(tmp);
1811 }
1812
1813 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1814
1815 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1816 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1817 {
1818 static const GVecGen2s g[4] = {
1819 { .fni8 = tcg_gen_vec_sub8_i64,
1820 .fniv = tcg_gen_sub_vec,
1821 .fno = gen_helper_gvec_subs8,
1822 .opt_opc = vecop_list_sub,
1823 .vece = MO_8 },
1824 { .fni8 = tcg_gen_vec_sub16_i64,
1825 .fniv = tcg_gen_sub_vec,
1826 .fno = gen_helper_gvec_subs16,
1827 .opt_opc = vecop_list_sub,
1828 .vece = MO_16 },
1829 { .fni4 = tcg_gen_sub_i32,
1830 .fniv = tcg_gen_sub_vec,
1831 .fno = gen_helper_gvec_subs32,
1832 .opt_opc = vecop_list_sub,
1833 .vece = MO_32 },
1834 { .fni8 = tcg_gen_sub_i64,
1835 .fniv = tcg_gen_sub_vec,
1836 .fno = gen_helper_gvec_subs64,
1837 .opt_opc = vecop_list_sub,
1838 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1839 .vece = MO_64 },
1840 };
1841
1842 tcg_debug_assert(vece <= MO_64);
1843 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1844 }
1845
1846 /* Perform a vector subtraction using normal subtraction and a mask.
1847 Compare gen_addv_mask above. */
1848 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1849 {
1850 TCGv_i64 t1 = tcg_temp_new_i64();
1851 TCGv_i64 t2 = tcg_temp_new_i64();
1852 TCGv_i64 t3 = tcg_temp_new_i64();
1853
1854 tcg_gen_or_i64(t1, a, m);
1855 tcg_gen_andc_i64(t2, b, m);
1856 tcg_gen_eqv_i64(t3, a, b);
1857 tcg_gen_sub_i64(d, t1, t2);
1858 tcg_gen_and_i64(t3, t3, m);
1859 tcg_gen_xor_i64(d, d, t3);
1860
1861 tcg_temp_free_i64(t1);
1862 tcg_temp_free_i64(t2);
1863 tcg_temp_free_i64(t3);
1864 }
1865
1866 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1867 {
1868 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1869 gen_subv_mask(d, a, b, m);
1870 tcg_temp_free_i64(m);
1871 }
1872
1873 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1874 {
1875 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1876 gen_subv_mask(d, a, b, m);
1877 tcg_temp_free_i64(m);
1878 }
1879
1880 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1881 {
1882 TCGv_i64 t1 = tcg_temp_new_i64();
1883 TCGv_i64 t2 = tcg_temp_new_i64();
1884
1885 tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1886 tcg_gen_sub_i64(t2, a, b);
1887 tcg_gen_sub_i64(t1, a, t1);
1888 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1889
1890 tcg_temp_free_i64(t1);
1891 tcg_temp_free_i64(t2);
1892 }
1893
1894 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1895 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1896 {
1897 static const GVecGen3 g[4] = {
1898 { .fni8 = tcg_gen_vec_sub8_i64,
1899 .fniv = tcg_gen_sub_vec,
1900 .fno = gen_helper_gvec_sub8,
1901 .opt_opc = vecop_list_sub,
1902 .vece = MO_8 },
1903 { .fni8 = tcg_gen_vec_sub16_i64,
1904 .fniv = tcg_gen_sub_vec,
1905 .fno = gen_helper_gvec_sub16,
1906 .opt_opc = vecop_list_sub,
1907 .vece = MO_16 },
1908 { .fni4 = tcg_gen_sub_i32,
1909 .fniv = tcg_gen_sub_vec,
1910 .fno = gen_helper_gvec_sub32,
1911 .opt_opc = vecop_list_sub,
1912 .vece = MO_32 },
1913 { .fni8 = tcg_gen_sub_i64,
1914 .fniv = tcg_gen_sub_vec,
1915 .fno = gen_helper_gvec_sub64,
1916 .opt_opc = vecop_list_sub,
1917 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1918 .vece = MO_64 },
1919 };
1920
1921 tcg_debug_assert(vece <= MO_64);
1922 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1923 }
1924
1925 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1926
1927 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1928 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1929 {
1930 static const GVecGen3 g[4] = {
1931 { .fniv = tcg_gen_mul_vec,
1932 .fno = gen_helper_gvec_mul8,
1933 .opt_opc = vecop_list_mul,
1934 .vece = MO_8 },
1935 { .fniv = tcg_gen_mul_vec,
1936 .fno = gen_helper_gvec_mul16,
1937 .opt_opc = vecop_list_mul,
1938 .vece = MO_16 },
1939 { .fni4 = tcg_gen_mul_i32,
1940 .fniv = tcg_gen_mul_vec,
1941 .fno = gen_helper_gvec_mul32,
1942 .opt_opc = vecop_list_mul,
1943 .vece = MO_32 },
1944 { .fni8 = tcg_gen_mul_i64,
1945 .fniv = tcg_gen_mul_vec,
1946 .fno = gen_helper_gvec_mul64,
1947 .opt_opc = vecop_list_mul,
1948 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1949 .vece = MO_64 },
1950 };
1951
1952 tcg_debug_assert(vece <= MO_64);
1953 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1954 }
1955
1956 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1957 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1958 {
1959 static const GVecGen2s g[4] = {
1960 { .fniv = tcg_gen_mul_vec,
1961 .fno = gen_helper_gvec_muls8,
1962 .opt_opc = vecop_list_mul,
1963 .vece = MO_8 },
1964 { .fniv = tcg_gen_mul_vec,
1965 .fno = gen_helper_gvec_muls16,
1966 .opt_opc = vecop_list_mul,
1967 .vece = MO_16 },
1968 { .fni4 = tcg_gen_mul_i32,
1969 .fniv = tcg_gen_mul_vec,
1970 .fno = gen_helper_gvec_muls32,
1971 .opt_opc = vecop_list_mul,
1972 .vece = MO_32 },
1973 { .fni8 = tcg_gen_mul_i64,
1974 .fniv = tcg_gen_mul_vec,
1975 .fno = gen_helper_gvec_muls64,
1976 .opt_opc = vecop_list_mul,
1977 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1978 .vece = MO_64 },
1979 };
1980
1981 tcg_debug_assert(vece <= MO_64);
1982 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1983 }
1984
1985 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1986 int64_t c, uint32_t oprsz, uint32_t maxsz)
1987 {
1988 TCGv_i64 tmp = tcg_const_i64(c);
1989 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1990 tcg_temp_free_i64(tmp);
1991 }
1992
1993 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1994 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1995 {
1996 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1997 static const GVecGen3 g[4] = {
1998 { .fniv = tcg_gen_ssadd_vec,
1999 .fno = gen_helper_gvec_ssadd8,
2000 .opt_opc = vecop_list,
2001 .vece = MO_8 },
2002 { .fniv = tcg_gen_ssadd_vec,
2003 .fno = gen_helper_gvec_ssadd16,
2004 .opt_opc = vecop_list,
2005 .vece = MO_16 },
2006 { .fniv = tcg_gen_ssadd_vec,
2007 .fno = gen_helper_gvec_ssadd32,
2008 .opt_opc = vecop_list,
2009 .vece = MO_32 },
2010 { .fniv = tcg_gen_ssadd_vec,
2011 .fno = gen_helper_gvec_ssadd64,
2012 .opt_opc = vecop_list,
2013 .vece = MO_64 },
2014 };
2015 tcg_debug_assert(vece <= MO_64);
2016 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2017 }
2018
2019 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2020 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2021 {
2022 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2023 static const GVecGen3 g[4] = {
2024 { .fniv = tcg_gen_sssub_vec,
2025 .fno = gen_helper_gvec_sssub8,
2026 .opt_opc = vecop_list,
2027 .vece = MO_8 },
2028 { .fniv = tcg_gen_sssub_vec,
2029 .fno = gen_helper_gvec_sssub16,
2030 .opt_opc = vecop_list,
2031 .vece = MO_16 },
2032 { .fniv = tcg_gen_sssub_vec,
2033 .fno = gen_helper_gvec_sssub32,
2034 .opt_opc = vecop_list,
2035 .vece = MO_32 },
2036 { .fniv = tcg_gen_sssub_vec,
2037 .fno = gen_helper_gvec_sssub64,
2038 .opt_opc = vecop_list,
2039 .vece = MO_64 },
2040 };
2041 tcg_debug_assert(vece <= MO_64);
2042 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2043 }
2044
2045 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2046 {
2047 TCGv_i32 max = tcg_const_i32(-1);
2048 tcg_gen_add_i32(d, a, b);
2049 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2050 tcg_temp_free_i32(max);
2051 }
2052
2053 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2054 {
2055 TCGv_i64 max = tcg_const_i64(-1);
2056 tcg_gen_add_i64(d, a, b);
2057 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2058 tcg_temp_free_i64(max);
2059 }
2060
2061 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2062 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2063 {
2064 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2065 static const GVecGen3 g[4] = {
2066 { .fniv = tcg_gen_usadd_vec,
2067 .fno = gen_helper_gvec_usadd8,
2068 .opt_opc = vecop_list,
2069 .vece = MO_8 },
2070 { .fniv = tcg_gen_usadd_vec,
2071 .fno = gen_helper_gvec_usadd16,
2072 .opt_opc = vecop_list,
2073 .vece = MO_16 },
2074 { .fni4 = tcg_gen_usadd_i32,
2075 .fniv = tcg_gen_usadd_vec,
2076 .fno = gen_helper_gvec_usadd32,
2077 .opt_opc = vecop_list,
2078 .vece = MO_32 },
2079 { .fni8 = tcg_gen_usadd_i64,
2080 .fniv = tcg_gen_usadd_vec,
2081 .fno = gen_helper_gvec_usadd64,
2082 .opt_opc = vecop_list,
2083 .vece = MO_64 }
2084 };
2085 tcg_debug_assert(vece <= MO_64);
2086 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2087 }
2088
2089 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2090 {
2091 TCGv_i32 min = tcg_const_i32(0);
2092 tcg_gen_sub_i32(d, a, b);
2093 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2094 tcg_temp_free_i32(min);
2095 }
2096
2097 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2098 {
2099 TCGv_i64 min = tcg_const_i64(0);
2100 tcg_gen_sub_i64(d, a, b);
2101 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2102 tcg_temp_free_i64(min);
2103 }
2104
2105 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2106 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2107 {
2108 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2109 static const GVecGen3 g[4] = {
2110 { .fniv = tcg_gen_ussub_vec,
2111 .fno = gen_helper_gvec_ussub8,
2112 .opt_opc = vecop_list,
2113 .vece = MO_8 },
2114 { .fniv = tcg_gen_ussub_vec,
2115 .fno = gen_helper_gvec_ussub16,
2116 .opt_opc = vecop_list,
2117 .vece = MO_16 },
2118 { .fni4 = tcg_gen_ussub_i32,
2119 .fniv = tcg_gen_ussub_vec,
2120 .fno = gen_helper_gvec_ussub32,
2121 .opt_opc = vecop_list,
2122 .vece = MO_32 },
2123 { .fni8 = tcg_gen_ussub_i64,
2124 .fniv = tcg_gen_ussub_vec,
2125 .fno = gen_helper_gvec_ussub64,
2126 .opt_opc = vecop_list,
2127 .vece = MO_64 }
2128 };
2129 tcg_debug_assert(vece <= MO_64);
2130 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2131 }
2132
2133 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2134 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2135 {
2136 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2137 static const GVecGen3 g[4] = {
2138 { .fniv = tcg_gen_smin_vec,
2139 .fno = gen_helper_gvec_smin8,
2140 .opt_opc = vecop_list,
2141 .vece = MO_8 },
2142 { .fniv = tcg_gen_smin_vec,
2143 .fno = gen_helper_gvec_smin16,
2144 .opt_opc = vecop_list,
2145 .vece = MO_16 },
2146 { .fni4 = tcg_gen_smin_i32,
2147 .fniv = tcg_gen_smin_vec,
2148 .fno = gen_helper_gvec_smin32,
2149 .opt_opc = vecop_list,
2150 .vece = MO_32 },
2151 { .fni8 = tcg_gen_smin_i64,
2152 .fniv = tcg_gen_smin_vec,
2153 .fno = gen_helper_gvec_smin64,
2154 .opt_opc = vecop_list,
2155 .vece = MO_64 }
2156 };
2157 tcg_debug_assert(vece <= MO_64);
2158 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2159 }
2160
2161 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2162 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2163 {
2164 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2165 static const GVecGen3 g[4] = {
2166 { .fniv = tcg_gen_umin_vec,
2167 .fno = gen_helper_gvec_umin8,
2168 .opt_opc = vecop_list,
2169 .vece = MO_8 },
2170 { .fniv = tcg_gen_umin_vec,
2171 .fno = gen_helper_gvec_umin16,
2172 .opt_opc = vecop_list,
2173 .vece = MO_16 },
2174 { .fni4 = tcg_gen_umin_i32,
2175 .fniv = tcg_gen_umin_vec,
2176 .fno = gen_helper_gvec_umin32,
2177 .opt_opc = vecop_list,
2178 .vece = MO_32 },
2179 { .fni8 = tcg_gen_umin_i64,
2180 .fniv = tcg_gen_umin_vec,
2181 .fno = gen_helper_gvec_umin64,
2182 .opt_opc = vecop_list,
2183 .vece = MO_64 }
2184 };
2185 tcg_debug_assert(vece <= MO_64);
2186 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2187 }
2188
2189 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2190 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2191 {
2192 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2193 static const GVecGen3 g[4] = {
2194 { .fniv = tcg_gen_smax_vec,
2195 .fno = gen_helper_gvec_smax8,
2196 .opt_opc = vecop_list,
2197 .vece = MO_8 },
2198 { .fniv = tcg_gen_smax_vec,
2199 .fno = gen_helper_gvec_smax16,
2200 .opt_opc = vecop_list,
2201 .vece = MO_16 },
2202 { .fni4 = tcg_gen_smax_i32,
2203 .fniv = tcg_gen_smax_vec,
2204 .fno = gen_helper_gvec_smax32,
2205 .opt_opc = vecop_list,
2206 .vece = MO_32 },
2207 { .fni8 = tcg_gen_smax_i64,
2208 .fniv = tcg_gen_smax_vec,
2209 .fno = gen_helper_gvec_smax64,
2210 .opt_opc = vecop_list,
2211 .vece = MO_64 }
2212 };
2213 tcg_debug_assert(vece <= MO_64);
2214 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2215 }
2216
2217 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2218 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2219 {
2220 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2221 static const GVecGen3 g[4] = {
2222 { .fniv = tcg_gen_umax_vec,
2223 .fno = gen_helper_gvec_umax8,
2224 .opt_opc = vecop_list,
2225 .vece = MO_8 },
2226 { .fniv = tcg_gen_umax_vec,
2227 .fno = gen_helper_gvec_umax16,
2228 .opt_opc = vecop_list,
2229 .vece = MO_16 },
2230 { .fni4 = tcg_gen_umax_i32,
2231 .fniv = tcg_gen_umax_vec,
2232 .fno = gen_helper_gvec_umax32,
2233 .opt_opc = vecop_list,
2234 .vece = MO_32 },
2235 { .fni8 = tcg_gen_umax_i64,
2236 .fniv = tcg_gen_umax_vec,
2237 .fno = gen_helper_gvec_umax64,
2238 .opt_opc = vecop_list,
2239 .vece = MO_64 }
2240 };
2241 tcg_debug_assert(vece <= MO_64);
2242 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2243 }
2244
2245 /* Perform a vector negation using normal negation and a mask.
2246 Compare gen_subv_mask above. */
2247 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2248 {
2249 TCGv_i64 t2 = tcg_temp_new_i64();
2250 TCGv_i64 t3 = tcg_temp_new_i64();
2251
2252 tcg_gen_andc_i64(t3, m, b);
2253 tcg_gen_andc_i64(t2, b, m);
2254 tcg_gen_sub_i64(d, m, t2);
2255 tcg_gen_xor_i64(d, d, t3);
2256
2257 tcg_temp_free_i64(t2);
2258 tcg_temp_free_i64(t3);
2259 }
2260
2261 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2262 {
2263 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2264 gen_negv_mask(d, b, m);
2265 tcg_temp_free_i64(m);
2266 }
2267
2268 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2269 {
2270 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2271 gen_negv_mask(d, b, m);
2272 tcg_temp_free_i64(m);
2273 }
2274
2275 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2276 {
2277 TCGv_i64 t1 = tcg_temp_new_i64();
2278 TCGv_i64 t2 = tcg_temp_new_i64();
2279
2280 tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2281 tcg_gen_neg_i64(t2, b);
2282 tcg_gen_neg_i64(t1, t1);
2283 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2284
2285 tcg_temp_free_i64(t1);
2286 tcg_temp_free_i64(t2);
2287 }
2288
2289 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2290 uint32_t oprsz, uint32_t maxsz)
2291 {
2292 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2293 static const GVecGen2 g[4] = {
2294 { .fni8 = tcg_gen_vec_neg8_i64,
2295 .fniv = tcg_gen_neg_vec,
2296 .fno = gen_helper_gvec_neg8,
2297 .opt_opc = vecop_list,
2298 .vece = MO_8 },
2299 { .fni8 = tcg_gen_vec_neg16_i64,
2300 .fniv = tcg_gen_neg_vec,
2301 .fno = gen_helper_gvec_neg16,
2302 .opt_opc = vecop_list,
2303 .vece = MO_16 },
2304 { .fni4 = tcg_gen_neg_i32,
2305 .fniv = tcg_gen_neg_vec,
2306 .fno = gen_helper_gvec_neg32,
2307 .opt_opc = vecop_list,
2308 .vece = MO_32 },
2309 { .fni8 = tcg_gen_neg_i64,
2310 .fniv = tcg_gen_neg_vec,
2311 .fno = gen_helper_gvec_neg64,
2312 .opt_opc = vecop_list,
2313 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2314 .vece = MO_64 },
2315 };
2316
2317 tcg_debug_assert(vece <= MO_64);
2318 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2319 }
2320
2321 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2322 {
2323 TCGv_i64 t = tcg_temp_new_i64();
2324 int nbit = 8 << vece;
2325
2326 /* Create -1 for each negative element. */
2327 tcg_gen_shri_i64(t, b, nbit - 1);
2328 tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2329 tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2330
2331 /*
2332 * Invert (via xor -1) and add one.
2333 * Because of the ordering the msb is cleared,
2334 * so we never have carry into the next element.
2335 */
2336 tcg_gen_xor_i64(d, b, t);
2337 tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2338 tcg_gen_add_i64(d, d, t);
2339
2340 tcg_temp_free_i64(t);
2341 }
2342
2343 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2344 {
2345 gen_absv_mask(d, b, MO_8);
2346 }
2347
2348 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2349 {
2350 gen_absv_mask(d, b, MO_16);
2351 }
2352
2353 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2354 uint32_t oprsz, uint32_t maxsz)
2355 {
2356 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2357 static const GVecGen2 g[4] = {
2358 { .fni8 = tcg_gen_vec_abs8_i64,
2359 .fniv = tcg_gen_abs_vec,
2360 .fno = gen_helper_gvec_abs8,
2361 .opt_opc = vecop_list,
2362 .vece = MO_8 },
2363 { .fni8 = tcg_gen_vec_abs16_i64,
2364 .fniv = tcg_gen_abs_vec,
2365 .fno = gen_helper_gvec_abs16,
2366 .opt_opc = vecop_list,
2367 .vece = MO_16 },
2368 { .fni4 = tcg_gen_abs_i32,
2369 .fniv = tcg_gen_abs_vec,
2370 .fno = gen_helper_gvec_abs32,
2371 .opt_opc = vecop_list,
2372 .vece = MO_32 },
2373 { .fni8 = tcg_gen_abs_i64,
2374 .fniv = tcg_gen_abs_vec,
2375 .fno = gen_helper_gvec_abs64,
2376 .opt_opc = vecop_list,
2377 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2378 .vece = MO_64 },
2379 };
2380
2381 tcg_debug_assert(vece <= MO_64);
2382 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2383 }
2384
2385 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2386 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2387 {
2388 static const GVecGen3 g = {
2389 .fni8 = tcg_gen_and_i64,
2390 .fniv = tcg_gen_and_vec,
2391 .fno = gen_helper_gvec_and,
2392 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2393 };
2394
2395 if (aofs == bofs) {
2396 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2397 } else {
2398 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2399 }
2400 }
2401
2402 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2403 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2404 {
2405 static const GVecGen3 g = {
2406 .fni8 = tcg_gen_or_i64,
2407 .fniv = tcg_gen_or_vec,
2408 .fno = gen_helper_gvec_or,
2409 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410 };
2411
2412 if (aofs == bofs) {
2413 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2414 } else {
2415 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2416 }
2417 }
2418
2419 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2420 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2421 {
2422 static const GVecGen3 g = {
2423 .fni8 = tcg_gen_xor_i64,
2424 .fniv = tcg_gen_xor_vec,
2425 .fno = gen_helper_gvec_xor,
2426 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2427 };
2428
2429 if (aofs == bofs) {
2430 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2431 } else {
2432 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2433 }
2434 }
2435
2436 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2437 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2438 {
2439 static const GVecGen3 g = {
2440 .fni8 = tcg_gen_andc_i64,
2441 .fniv = tcg_gen_andc_vec,
2442 .fno = gen_helper_gvec_andc,
2443 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2444 };
2445
2446 if (aofs == bofs) {
2447 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2448 } else {
2449 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2450 }
2451 }
2452
2453 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2454 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2455 {
2456 static const GVecGen3 g = {
2457 .fni8 = tcg_gen_orc_i64,
2458 .fniv = tcg_gen_orc_vec,
2459 .fno = gen_helper_gvec_orc,
2460 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2461 };
2462
2463 if (aofs == bofs) {
2464 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2465 } else {
2466 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2467 }
2468 }
2469
2470 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2471 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2472 {
2473 static const GVecGen3 g = {
2474 .fni8 = tcg_gen_nand_i64,
2475 .fniv = tcg_gen_nand_vec,
2476 .fno = gen_helper_gvec_nand,
2477 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2478 };
2479
2480 if (aofs == bofs) {
2481 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2482 } else {
2483 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2484 }
2485 }
2486
2487 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2488 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2489 {
2490 static const GVecGen3 g = {
2491 .fni8 = tcg_gen_nor_i64,
2492 .fniv = tcg_gen_nor_vec,
2493 .fno = gen_helper_gvec_nor,
2494 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2495 };
2496
2497 if (aofs == bofs) {
2498 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2499 } else {
2500 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2501 }
2502 }
2503
2504 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2505 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2506 {
2507 static const GVecGen3 g = {
2508 .fni8 = tcg_gen_eqv_i64,
2509 .fniv = tcg_gen_eqv_vec,
2510 .fno = gen_helper_gvec_eqv,
2511 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2512 };
2513
2514 if (aofs == bofs) {
2515 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2516 } else {
2517 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2518 }
2519 }
2520
2521 static const GVecGen2s gop_ands = {
2522 .fni8 = tcg_gen_and_i64,
2523 .fniv = tcg_gen_and_vec,
2524 .fno = gen_helper_gvec_ands,
2525 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2526 .vece = MO_64
2527 };
2528
2529 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2530 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2531 {
2532 TCGv_i64 tmp = tcg_temp_new_i64();
2533 gen_dup_i64(vece, tmp, c);
2534 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2535 tcg_temp_free_i64(tmp);
2536 }
2537
2538 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2539 int64_t c, uint32_t oprsz, uint32_t maxsz)
2540 {
2541 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2542 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2543 tcg_temp_free_i64(tmp);
2544 }
2545
2546 static const GVecGen2s gop_xors = {
2547 .fni8 = tcg_gen_xor_i64,
2548 .fniv = tcg_gen_xor_vec,
2549 .fno = gen_helper_gvec_xors,
2550 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2551 .vece = MO_64
2552 };
2553
2554 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2555 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2556 {
2557 TCGv_i64 tmp = tcg_temp_new_i64();
2558 gen_dup_i64(vece, tmp, c);
2559 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2560 tcg_temp_free_i64(tmp);
2561 }
2562
2563 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2564 int64_t c, uint32_t oprsz, uint32_t maxsz)
2565 {
2566 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2567 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2568 tcg_temp_free_i64(tmp);
2569 }
2570
2571 static const GVecGen2s gop_ors = {
2572 .fni8 = tcg_gen_or_i64,
2573 .fniv = tcg_gen_or_vec,
2574 .fno = gen_helper_gvec_ors,
2575 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2576 .vece = MO_64
2577 };
2578
2579 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2580 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2581 {
2582 TCGv_i64 tmp = tcg_temp_new_i64();
2583 gen_dup_i64(vece, tmp, c);
2584 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2585 tcg_temp_free_i64(tmp);
2586 }
2587
2588 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2589 int64_t c, uint32_t oprsz, uint32_t maxsz)
2590 {
2591 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2592 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2593 tcg_temp_free_i64(tmp);
2594 }
2595
2596 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2597 {
2598 uint64_t mask = dup_const(MO_8, 0xff << c);
2599 tcg_gen_shli_i64(d, a, c);
2600 tcg_gen_andi_i64(d, d, mask);
2601 }
2602
2603 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2604 {
2605 uint64_t mask = dup_const(MO_16, 0xffff << c);
2606 tcg_gen_shli_i64(d, a, c);
2607 tcg_gen_andi_i64(d, d, mask);
2608 }
2609
2610 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2611 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2612 {
2613 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2614 static const GVecGen2i g[4] = {
2615 { .fni8 = tcg_gen_vec_shl8i_i64,
2616 .fniv = tcg_gen_shli_vec,
2617 .fno = gen_helper_gvec_shl8i,
2618 .opt_opc = vecop_list,
2619 .vece = MO_8 },
2620 { .fni8 = tcg_gen_vec_shl16i_i64,
2621 .fniv = tcg_gen_shli_vec,
2622 .fno = gen_helper_gvec_shl16i,
2623 .opt_opc = vecop_list,
2624 .vece = MO_16 },
2625 { .fni4 = tcg_gen_shli_i32,
2626 .fniv = tcg_gen_shli_vec,
2627 .fno = gen_helper_gvec_shl32i,
2628 .opt_opc = vecop_list,
2629 .vece = MO_32 },
2630 { .fni8 = tcg_gen_shli_i64,
2631 .fniv = tcg_gen_shli_vec,
2632 .fno = gen_helper_gvec_shl64i,
2633 .opt_opc = vecop_list,
2634 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2635 .vece = MO_64 },
2636 };
2637
2638 tcg_debug_assert(vece <= MO_64);
2639 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2640 if (shift == 0) {
2641 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2642 } else {
2643 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2644 }
2645 }
2646
2647 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2648 {
2649 uint64_t mask = dup_const(MO_8, 0xff >> c);
2650 tcg_gen_shri_i64(d, a, c);
2651 tcg_gen_andi_i64(d, d, mask);
2652 }
2653
2654 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2655 {
2656 uint64_t mask = dup_const(MO_16, 0xffff >> c);
2657 tcg_gen_shri_i64(d, a, c);
2658 tcg_gen_andi_i64(d, d, mask);
2659 }
2660
2661 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2662 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2663 {
2664 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2665 static const GVecGen2i g[4] = {
2666 { .fni8 = tcg_gen_vec_shr8i_i64,
2667 .fniv = tcg_gen_shri_vec,
2668 .fno = gen_helper_gvec_shr8i,
2669 .opt_opc = vecop_list,
2670 .vece = MO_8 },
2671 { .fni8 = tcg_gen_vec_shr16i_i64,
2672 .fniv = tcg_gen_shri_vec,
2673 .fno = gen_helper_gvec_shr16i,
2674 .opt_opc = vecop_list,
2675 .vece = MO_16 },
2676 { .fni4 = tcg_gen_shri_i32,
2677 .fniv = tcg_gen_shri_vec,
2678 .fno = gen_helper_gvec_shr32i,
2679 .opt_opc = vecop_list,
2680 .vece = MO_32 },
2681 { .fni8 = tcg_gen_shri_i64,
2682 .fniv = tcg_gen_shri_vec,
2683 .fno = gen_helper_gvec_shr64i,
2684 .opt_opc = vecop_list,
2685 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2686 .vece = MO_64 },
2687 };
2688
2689 tcg_debug_assert(vece <= MO_64);
2690 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2691 if (shift == 0) {
2692 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2693 } else {
2694 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2695 }
2696 }
2697
2698 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2699 {
2700 uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2701 uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2702 TCGv_i64 s = tcg_temp_new_i64();
2703
2704 tcg_gen_shri_i64(d, a, c);
2705 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
2706 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2707 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
2708 tcg_gen_or_i64(d, d, s); /* include sign extension */
2709 tcg_temp_free_i64(s);
2710 }
2711
2712 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2713 {
2714 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2715 uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2716 TCGv_i64 s = tcg_temp_new_i64();
2717
2718 tcg_gen_shri_i64(d, a, c);
2719 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
2720 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
2721 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2722 tcg_gen_or_i64(d, d, s); /* include sign extension */
2723 tcg_temp_free_i64(s);
2724 }
2725
2726 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2727 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2728 {
2729 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2730 static const GVecGen2i g[4] = {
2731 { .fni8 = tcg_gen_vec_sar8i_i64,
2732 .fniv = tcg_gen_sari_vec,
2733 .fno = gen_helper_gvec_sar8i,
2734 .opt_opc = vecop_list,
2735 .vece = MO_8 },
2736 { .fni8 = tcg_gen_vec_sar16i_i64,
2737 .fniv = tcg_gen_sari_vec,
2738 .fno = gen_helper_gvec_sar16i,
2739 .opt_opc = vecop_list,
2740 .vece = MO_16 },
2741 { .fni4 = tcg_gen_sari_i32,
2742 .fniv = tcg_gen_sari_vec,
2743 .fno = gen_helper_gvec_sar32i,
2744 .opt_opc = vecop_list,
2745 .vece = MO_32 },
2746 { .fni8 = tcg_gen_sari_i64,
2747 .fniv = tcg_gen_sari_vec,
2748 .fno = gen_helper_gvec_sar64i,
2749 .opt_opc = vecop_list,
2750 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2751 .vece = MO_64 },
2752 };
2753
2754 tcg_debug_assert(vece <= MO_64);
2755 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2756 if (shift == 0) {
2757 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2758 } else {
2759 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2760 }
2761 }
2762
2763 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2764 {
2765 uint64_t mask = dup_const(MO_8, 0xff << c);
2766
2767 tcg_gen_shli_i64(d, a, c);
2768 tcg_gen_shri_i64(a, a, 8 - c);
2769 tcg_gen_andi_i64(d, d, mask);
2770 tcg_gen_andi_i64(a, a, ~mask);
2771 tcg_gen_or_i64(d, d, a);
2772 }
2773
2774 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2775 {
2776 uint64_t mask = dup_const(MO_16, 0xffff << c);
2777
2778 tcg_gen_shli_i64(d, a, c);
2779 tcg_gen_shri_i64(a, a, 16 - c);
2780 tcg_gen_andi_i64(d, d, mask);
2781 tcg_gen_andi_i64(a, a, ~mask);
2782 tcg_gen_or_i64(d, d, a);
2783 }
2784
2785 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2786 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2787 {
2788 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2789 static const GVecGen2i g[4] = {
2790 { .fni8 = tcg_gen_vec_rotl8i_i64,
2791 .fniv = tcg_gen_rotli_vec,
2792 .fno = gen_helper_gvec_rotl8i,
2793 .opt_opc = vecop_list,
2794 .vece = MO_8 },
2795 { .fni8 = tcg_gen_vec_rotl16i_i64,
2796 .fniv = tcg_gen_rotli_vec,
2797 .fno = gen_helper_gvec_rotl16i,
2798 .opt_opc = vecop_list,
2799 .vece = MO_16 },
2800 { .fni4 = tcg_gen_rotli_i32,
2801 .fniv = tcg_gen_rotli_vec,
2802 .fno = gen_helper_gvec_rotl32i,
2803 .opt_opc = vecop_list,
2804 .vece = MO_32 },
2805 { .fni8 = tcg_gen_rotli_i64,
2806 .fniv = tcg_gen_rotli_vec,
2807 .fno = gen_helper_gvec_rotl64i,
2808 .opt_opc = vecop_list,
2809 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2810 .vece = MO_64 },
2811 };
2812
2813 tcg_debug_assert(vece <= MO_64);
2814 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2815 if (shift == 0) {
2816 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2817 } else {
2818 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2819 }
2820 }
2821
2822 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2823 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2824 {
2825 tcg_debug_assert(vece <= MO_64);
2826 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2827 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2828 oprsz, maxsz);
2829 }
2830
2831 /*
2832 * Specialized generation vector shifts by a non-constant scalar.
2833 */
2834
2835 typedef struct {
2836 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2837 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2838 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2839 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2840 gen_helper_gvec_2 *fno[4];
2841 TCGOpcode s_list[2];
2842 TCGOpcode v_list[2];
2843 } GVecGen2sh;
2844
2845 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2846 uint32_t oprsz, uint32_t tysz, TCGType type,
2847 TCGv_i32 shift,
2848 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2849 {
2850 TCGv_vec t0 = tcg_temp_new_vec(type);
2851 uint32_t i;
2852
2853 for (i = 0; i < oprsz; i += tysz) {
2854 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2855 fni(vece, t0, t0, shift);
2856 tcg_gen_st_vec(t0, cpu_env, dofs + i);
2857 }
2858 tcg_temp_free_vec(t0);
2859 }
2860
2861 static void
2862 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2863 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2864 {
2865 TCGType type;
2866 uint32_t some;
2867
2868 check_size_align(oprsz, maxsz, dofs | aofs);
2869 check_overlap_2(dofs, aofs, maxsz);
2870
2871 /* If the backend has a scalar expansion, great. */
2872 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2873 if (type) {
2874 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2875 switch (type) {
2876 case TCG_TYPE_V256:
2877 some = QEMU_ALIGN_DOWN(oprsz, 32);
2878 expand_2sh_vec(vece, dofs, aofs, some, 32,
2879 TCG_TYPE_V256, shift, g->fniv_s);
2880 if (some == oprsz) {
2881 break;
2882 }
2883 dofs += some;
2884 aofs += some;
2885 oprsz -= some;
2886 maxsz -= some;
2887 /* fallthru */
2888 case TCG_TYPE_V128:
2889 expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2890 TCG_TYPE_V128, shift, g->fniv_s);
2891 break;
2892 case TCG_TYPE_V64:
2893 expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2894 TCG_TYPE_V64, shift, g->fniv_s);
2895 break;
2896 default:
2897 g_assert_not_reached();
2898 }
2899 tcg_swap_vecop_list(hold_list);
2900 goto clear_tail;
2901 }
2902
2903 /* If the backend supports variable vector shifts, also cool. */
2904 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2905 if (type) {
2906 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2907 TCGv_vec v_shift = tcg_temp_new_vec(type);
2908
2909 if (vece == MO_64) {
2910 TCGv_i64 sh64 = tcg_temp_new_i64();
2911 tcg_gen_extu_i32_i64(sh64, shift);
2912 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2913 tcg_temp_free_i64(sh64);
2914 } else {
2915 tcg_gen_dup_i32_vec(vece, v_shift, shift);
2916 }
2917
2918 switch (type) {
2919 case TCG_TYPE_V256:
2920 some = QEMU_ALIGN_DOWN(oprsz, 32);
2921 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2922 v_shift, false, g->fniv_v);
2923 if (some == oprsz) {
2924 break;
2925 }
2926 dofs += some;
2927 aofs += some;
2928 oprsz -= some;
2929 maxsz -= some;
2930 /* fallthru */
2931 case TCG_TYPE_V128:
2932 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2933 v_shift, false, g->fniv_v);
2934 break;
2935 case TCG_TYPE_V64:
2936 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2937 v_shift, false, g->fniv_v);
2938 break;
2939 default:
2940 g_assert_not_reached();
2941 }
2942 tcg_temp_free_vec(v_shift);
2943 tcg_swap_vecop_list(hold_list);
2944 goto clear_tail;
2945 }
2946
2947 /* Otherwise fall back to integral... */
2948 if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2949 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2950 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2951 TCGv_i64 sh64 = tcg_temp_new_i64();
2952 tcg_gen_extu_i32_i64(sh64, shift);
2953 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2954 tcg_temp_free_i64(sh64);
2955 } else {
2956 TCGv_ptr a0 = tcg_temp_new_ptr();
2957 TCGv_ptr a1 = tcg_temp_new_ptr();
2958 TCGv_i32 desc = tcg_temp_new_i32();
2959
2960 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2961 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2962 tcg_gen_addi_ptr(a0, cpu_env, dofs);
2963 tcg_gen_addi_ptr(a1, cpu_env, aofs);
2964
2965 g->fno[vece](a0, a1, desc);
2966
2967 tcg_temp_free_ptr(a0);
2968 tcg_temp_free_ptr(a1);
2969 tcg_temp_free_i32(desc);
2970 return;
2971 }
2972
2973 clear_tail:
2974 if (oprsz < maxsz) {
2975 expand_clr(dofs + oprsz, maxsz - oprsz);
2976 }
2977 }
2978
2979 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2980 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2981 {
2982 static const GVecGen2sh g = {
2983 .fni4 = tcg_gen_shl_i32,
2984 .fni8 = tcg_gen_shl_i64,
2985 .fniv_s = tcg_gen_shls_vec,
2986 .fniv_v = tcg_gen_shlv_vec,
2987 .fno = {
2988 gen_helper_gvec_shl8i,
2989 gen_helper_gvec_shl16i,
2990 gen_helper_gvec_shl32i,
2991 gen_helper_gvec_shl64i,
2992 },
2993 .s_list = { INDEX_op_shls_vec, 0 },
2994 .v_list = { INDEX_op_shlv_vec, 0 },
2995 };
2996
2997 tcg_debug_assert(vece <= MO_64);
2998 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2999 }
3000
3001 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3002 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3003 {
3004 static const GVecGen2sh g = {
3005 .fni4 = tcg_gen_shr_i32,
3006 .fni8 = tcg_gen_shr_i64,
3007 .fniv_s = tcg_gen_shrs_vec,
3008 .fniv_v = tcg_gen_shrv_vec,
3009 .fno = {
3010 gen_helper_gvec_shr8i,
3011 gen_helper_gvec_shr16i,
3012 gen_helper_gvec_shr32i,
3013 gen_helper_gvec_shr64i,
3014 },
3015 .s_list = { INDEX_op_shrs_vec, 0 },
3016 .v_list = { INDEX_op_shrv_vec, 0 },
3017 };
3018
3019 tcg_debug_assert(vece <= MO_64);
3020 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3021 }
3022
3023 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3024 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3025 {
3026 static const GVecGen2sh g = {
3027 .fni4 = tcg_gen_sar_i32,
3028 .fni8 = tcg_gen_sar_i64,
3029 .fniv_s = tcg_gen_sars_vec,
3030 .fniv_v = tcg_gen_sarv_vec,
3031 .fno = {
3032 gen_helper_gvec_sar8i,
3033 gen_helper_gvec_sar16i,
3034 gen_helper_gvec_sar32i,
3035 gen_helper_gvec_sar64i,
3036 },
3037 .s_list = { INDEX_op_sars_vec, 0 },
3038 .v_list = { INDEX_op_sarv_vec, 0 },
3039 };
3040
3041 tcg_debug_assert(vece <= MO_64);
3042 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3043 }
3044
3045 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3046 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3047 {
3048 static const GVecGen2sh g = {
3049 .fni4 = tcg_gen_rotl_i32,
3050 .fni8 = tcg_gen_rotl_i64,
3051 .fniv_s = tcg_gen_rotls_vec,
3052 .fniv_v = tcg_gen_rotlv_vec,
3053 .fno = {
3054 gen_helper_gvec_rotl8i,
3055 gen_helper_gvec_rotl16i,
3056 gen_helper_gvec_rotl32i,
3057 gen_helper_gvec_rotl64i,
3058 },
3059 .s_list = { INDEX_op_rotls_vec, 0 },
3060 .v_list = { INDEX_op_rotlv_vec, 0 },
3061 };
3062
3063 tcg_debug_assert(vece <= MO_64);
3064 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3065 }
3066
3067 /*
3068 * Expand D = A << (B % element bits)
3069 *
3070 * Unlike scalar shifts, where it is easy for the target front end
3071 * to include the modulo as part of the expansion. If the target
3072 * naturally includes the modulo as part of the operation, great!
3073 * If the target has some other behaviour from out-of-range shifts,
3074 * then it could not use this function anyway, and would need to
3075 * do it's own expansion with custom functions.
3076 */
3077 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3078 TCGv_vec a, TCGv_vec b)
3079 {
3080 TCGv_vec t = tcg_temp_new_vec_matching(d);
3081
3082 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3083 tcg_gen_and_vec(vece, t, t, b);
3084 tcg_gen_shlv_vec(vece, d, a, t);
3085 tcg_temp_free_vec(t);
3086 }
3087
3088 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3089 {
3090 TCGv_i32 t = tcg_temp_new_i32();
3091
3092 tcg_gen_andi_i32(t, b, 31);
3093 tcg_gen_shl_i32(d, a, t);
3094 tcg_temp_free_i32(t);
3095 }
3096
3097 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3098 {
3099 TCGv_i64 t = tcg_temp_new_i64();
3100
3101 tcg_gen_andi_i64(t, b, 63);
3102 tcg_gen_shl_i64(d, a, t);
3103 tcg_temp_free_i64(t);
3104 }
3105
3106 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3107 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3108 {
3109 static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3110 static const GVecGen3 g[4] = {
3111 { .fniv = tcg_gen_shlv_mod_vec,
3112 .fno = gen_helper_gvec_shl8v,
3113 .opt_opc = vecop_list,
3114 .vece = MO_8 },
3115 { .fniv = tcg_gen_shlv_mod_vec,
3116 .fno = gen_helper_gvec_shl16v,
3117 .opt_opc = vecop_list,
3118 .vece = MO_16 },
3119 { .fni4 = tcg_gen_shl_mod_i32,
3120 .fniv = tcg_gen_shlv_mod_vec,
3121 .fno = gen_helper_gvec_shl32v,
3122 .opt_opc = vecop_list,
3123 .vece = MO_32 },
3124 { .fni8 = tcg_gen_shl_mod_i64,
3125 .fniv = tcg_gen_shlv_mod_vec,
3126 .fno = gen_helper_gvec_shl64v,
3127 .opt_opc = vecop_list,
3128 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3129 .vece = MO_64 },
3130 };
3131
3132 tcg_debug_assert(vece <= MO_64);
3133 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3134 }
3135
3136 /*
3137 * Similarly for logical right shifts.
3138 */
3139
3140 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3141 TCGv_vec a, TCGv_vec b)
3142 {
3143 TCGv_vec t = tcg_temp_new_vec_matching(d);
3144
3145 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3146 tcg_gen_and_vec(vece, t, t, b);
3147 tcg_gen_shrv_vec(vece, d, a, t);
3148 tcg_temp_free_vec(t);
3149 }
3150
3151 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3152 {
3153 TCGv_i32 t = tcg_temp_new_i32();
3154
3155 tcg_gen_andi_i32(t, b, 31);
3156 tcg_gen_shr_i32(d, a, t);
3157 tcg_temp_free_i32(t);
3158 }
3159
3160 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3161 {
3162 TCGv_i64 t = tcg_temp_new_i64();
3163
3164 tcg_gen_andi_i64(t, b, 63);
3165 tcg_gen_shr_i64(d, a, t);
3166 tcg_temp_free_i64(t);
3167 }
3168
3169 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3170 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3171 {
3172 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3173 static const GVecGen3 g[4] = {
3174 { .fniv = tcg_gen_shrv_mod_vec,
3175 .fno = gen_helper_gvec_shr8v,
3176 .opt_opc = vecop_list,
3177 .vece = MO_8 },
3178 { .fniv = tcg_gen_shrv_mod_vec,
3179 .fno = gen_helper_gvec_shr16v,
3180 .opt_opc = vecop_list,
3181 .vece = MO_16 },
3182 { .fni4 = tcg_gen_shr_mod_i32,
3183 .fniv = tcg_gen_shrv_mod_vec,
3184 .fno = gen_helper_gvec_shr32v,
3185 .opt_opc = vecop_list,
3186 .vece = MO_32 },
3187 { .fni8 = tcg_gen_shr_mod_i64,
3188 .fniv = tcg_gen_shrv_mod_vec,
3189 .fno = gen_helper_gvec_shr64v,
3190 .opt_opc = vecop_list,
3191 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3192 .vece = MO_64 },
3193 };
3194
3195 tcg_debug_assert(vece <= MO_64);
3196 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3197 }
3198
3199 /*
3200 * Similarly for arithmetic right shifts.
3201 */
3202
3203 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3204 TCGv_vec a, TCGv_vec b)
3205 {
3206 TCGv_vec t = tcg_temp_new_vec_matching(d);
3207
3208 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3209 tcg_gen_and_vec(vece, t, t, b);
3210 tcg_gen_sarv_vec(vece, d, a, t);
3211 tcg_temp_free_vec(t);
3212 }
3213
3214 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3215 {
3216 TCGv_i32 t = tcg_temp_new_i32();
3217
3218 tcg_gen_andi_i32(t, b, 31);
3219 tcg_gen_sar_i32(d, a, t);
3220 tcg_temp_free_i32(t);
3221 }
3222
3223 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3224 {
3225 TCGv_i64 t = tcg_temp_new_i64();
3226
3227 tcg_gen_andi_i64(t, b, 63);
3228 tcg_gen_sar_i64(d, a, t);
3229 tcg_temp_free_i64(t);
3230 }
3231
3232 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3233 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3234 {
3235 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3236 static const GVecGen3 g[4] = {
3237 { .fniv = tcg_gen_sarv_mod_vec,
3238 .fno = gen_helper_gvec_sar8v,
3239 .opt_opc = vecop_list,
3240 .vece = MO_8 },
3241 { .fniv = tcg_gen_sarv_mod_vec,
3242 .fno = gen_helper_gvec_sar16v,
3243 .opt_opc = vecop_list,
3244 .vece = MO_16 },
3245 { .fni4 = tcg_gen_sar_mod_i32,
3246 .fniv = tcg_gen_sarv_mod_vec,
3247 .fno = gen_helper_gvec_sar32v,
3248 .opt_opc = vecop_list,
3249 .vece = MO_32 },
3250 { .fni8 = tcg_gen_sar_mod_i64,
3251 .fniv = tcg_gen_sarv_mod_vec,
3252 .fno = gen_helper_gvec_sar64v,
3253 .opt_opc = vecop_list,
3254 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3255 .vece = MO_64 },
3256 };
3257
3258 tcg_debug_assert(vece <= MO_64);
3259 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3260 }
3261
3262 /*
3263 * Similarly for rotates.
3264 */
3265
3266 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3267 TCGv_vec a, TCGv_vec b)
3268 {
3269 TCGv_vec t = tcg_temp_new_vec_matching(d);
3270
3271 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3272 tcg_gen_and_vec(vece, t, t, b);
3273 tcg_gen_rotlv_vec(vece, d, a, t);
3274 tcg_temp_free_vec(t);
3275 }
3276
3277 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3278 {
3279 TCGv_i32 t = tcg_temp_new_i32();
3280
3281 tcg_gen_andi_i32(t, b, 31);
3282 tcg_gen_rotl_i32(d, a, t);
3283 tcg_temp_free_i32(t);
3284 }
3285
3286 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3287 {
3288 TCGv_i64 t = tcg_temp_new_i64();
3289
3290 tcg_gen_andi_i64(t, b, 63);
3291 tcg_gen_rotl_i64(d, a, t);
3292 tcg_temp_free_i64(t);
3293 }
3294
3295 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3296 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3297 {
3298 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3299 static const GVecGen3 g[4] = {
3300 { .fniv = tcg_gen_rotlv_mod_vec,
3301 .fno = gen_helper_gvec_rotl8v,
3302 .opt_opc = vecop_list,
3303 .vece = MO_8 },
3304 { .fniv = tcg_gen_rotlv_mod_vec,
3305 .fno = gen_helper_gvec_rotl16v,
3306 .opt_opc = vecop_list,
3307 .vece = MO_16 },
3308 { .fni4 = tcg_gen_rotl_mod_i32,
3309 .fniv = tcg_gen_rotlv_mod_vec,
3310 .fno = gen_helper_gvec_rotl32v,
3311 .opt_opc = vecop_list,
3312 .vece = MO_32 },
3313 { .fni8 = tcg_gen_rotl_mod_i64,
3314 .fniv = tcg_gen_rotlv_mod_vec,
3315 .fno = gen_helper_gvec_rotl64v,
3316 .opt_opc = vecop_list,
3317 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3318 .vece = MO_64 },
3319 };
3320
3321 tcg_debug_assert(vece <= MO_64);
3322 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3323 }
3324
3325 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3326 TCGv_vec a, TCGv_vec b)
3327 {
3328 TCGv_vec t = tcg_temp_new_vec_matching(d);
3329
3330 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3331 tcg_gen_and_vec(vece, t, t, b);
3332 tcg_gen_rotrv_vec(vece, d, a, t);
3333 tcg_temp_free_vec(t);
3334 }
3335
3336 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3337 {
3338 TCGv_i32 t = tcg_temp_new_i32();
3339
3340 tcg_gen_andi_i32(t, b, 31);
3341 tcg_gen_rotr_i32(d, a, t);
3342 tcg_temp_free_i32(t);
3343 }
3344
3345 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3346 {
3347 TCGv_i64 t = tcg_temp_new_i64();
3348
3349 tcg_gen_andi_i64(t, b, 63);
3350 tcg_gen_rotr_i64(d, a, t);
3351 tcg_temp_free_i64(t);
3352 }
3353
3354 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3355 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3356 {
3357 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3358 static const GVecGen3 g[4] = {
3359 { .fniv = tcg_gen_rotrv_mod_vec,
3360 .fno = gen_helper_gvec_rotr8v,
3361 .opt_opc = vecop_list,
3362 .vece = MO_8 },
3363 { .fniv = tcg_gen_rotrv_mod_vec,
3364 .fno = gen_helper_gvec_rotr16v,
3365 .opt_opc = vecop_list,
3366 .vece = MO_16 },
3367 { .fni4 = tcg_gen_rotr_mod_i32,
3368 .fniv = tcg_gen_rotrv_mod_vec,
3369 .fno = gen_helper_gvec_rotr32v,
3370 .opt_opc = vecop_list,
3371 .vece = MO_32 },
3372 { .fni8 = tcg_gen_rotr_mod_i64,
3373 .fniv = tcg_gen_rotrv_mod_vec,
3374 .fno = gen_helper_gvec_rotr64v,
3375 .opt_opc = vecop_list,
3376 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3377 .vece = MO_64 },
3378 };
3379
3380 tcg_debug_assert(vece <= MO_64);
3381 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3382 }
3383
3384 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
3385 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3386 uint32_t oprsz, TCGCond cond)
3387 {
3388 TCGv_i32 t0 = tcg_temp_new_i32();
3389 TCGv_i32 t1 = tcg_temp_new_i32();
3390 uint32_t i;
3391
3392 for (i = 0; i < oprsz; i += 4) {
3393 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3394 tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3395 tcg_gen_setcond_i32(cond, t0, t0, t1);
3396 tcg_gen_neg_i32(t0, t0);
3397 tcg_gen_st_i32(t0, cpu_env, dofs + i);
3398 }
3399 tcg_temp_free_i32(t1);
3400 tcg_temp_free_i32(t0);
3401 }
3402
3403 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3404 uint32_t oprsz, TCGCond cond)
3405 {
3406 TCGv_i64 t0 = tcg_temp_new_i64();
3407 TCGv_i64 t1 = tcg_temp_new_i64();
3408 uint32_t i;
3409
3410 for (i = 0; i < oprsz; i += 8) {
3411 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3412 tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3413 tcg_gen_setcond_i64(cond, t0, t0, t1);
3414 tcg_gen_neg_i64(t0, t0);
3415 tcg_gen_st_i64(t0, cpu_env, dofs + i);
3416 }
3417 tcg_temp_free_i64(t1);
3418 tcg_temp_free_i64(t0);
3419 }
3420
3421 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3422 uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3423 TCGType type, TCGCond cond)
3424 {
3425 TCGv_vec t0 = tcg_temp_new_vec(type);
3426 TCGv_vec t1 = tcg_temp_new_vec(type);
3427 uint32_t i;
3428
3429 for (i = 0; i < oprsz; i += tysz) {
3430 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3431 tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3432 tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3433 tcg_gen_st_vec(t0, cpu_env, dofs + i);
3434 }
3435 tcg_temp_free_vec(t1);
3436 tcg_temp_free_vec(t0);
3437 }
3438
3439 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3440 uint32_t aofs, uint32_t bofs,
3441 uint32_t oprsz, uint32_t maxsz)
3442 {
3443 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3444 static gen_helper_gvec_3 * const eq_fn[4] = {
3445 gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3446 gen_helper_gvec_eq32, gen_helper_gvec_eq64
3447 };
3448 static gen_helper_gvec_3 * const ne_fn[4] = {
3449 gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3450 gen_helper_gvec_ne32, gen_helper_gvec_ne64
3451 };
3452 static gen_helper_gvec_3 * const lt_fn[4] = {
3453 gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3454 gen_helper_gvec_lt32, gen_helper_gvec_lt64
3455 };
3456 static gen_helper_gvec_3 * const le_fn[4] = {
3457 gen_helper_gvec_le8, gen_helper_gvec_le16,
3458 gen_helper_gvec_le32, gen_helper_gvec_le64
3459 };
3460 static gen_helper_gvec_3 * const ltu_fn[4] = {
3461 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3462 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3463 };
3464 static gen_helper_gvec_3 * const leu_fn[4] = {
3465 gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3466 gen_helper_gvec_leu32, gen_helper_gvec_leu64
3467 };
3468 static gen_helper_gvec_3 * const * const fns[16] = {
3469 [TCG_COND_EQ] = eq_fn,
3470 [TCG_COND_NE] = ne_fn,
3471 [TCG_COND_LT] = lt_fn,
3472 [TCG_COND_LE] = le_fn,
3473 [TCG_COND_LTU] = ltu_fn,
3474 [TCG_COND_LEU] = leu_fn,
3475 };
3476
3477 const TCGOpcode *hold_list;
3478 TCGType type;
3479 uint32_t some;
3480
3481 check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3482 check_overlap_3(dofs, aofs, bofs, maxsz);
3483
3484 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3485 do_dup(MO_8, dofs, oprsz, maxsz,
3486 NULL, NULL, -(cond == TCG_COND_ALWAYS));
3487 return;
3488 }
3489
3490 /*
3491 * Implement inline with a vector type, if possible.
3492 * Prefer integer when 64-bit host and 64-bit comparison.
3493 */
3494 hold_list = tcg_swap_vecop_list(cmp_list);
3495 type = choose_vector_type(cmp_list, vece, oprsz,
3496 TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3497 switch (type) {
3498 case TCG_TYPE_V256:
3499 /* Recall that ARM SVE allows vector sizes that are not a
3500 * power of 2, but always a multiple of 16. The intent is
3501 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3502 */
3503 some = QEMU_ALIGN_DOWN(oprsz, 32);
3504 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3505 if (some == oprsz) {
3506 break;
3507 }
3508 dofs += some;
3509 aofs += some;
3510 bofs += some;
3511 oprsz -= some;
3512 maxsz -= some;
3513 /* fallthru */
3514 case TCG_TYPE_V128:
3515 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3516 break