target/arm: Implement fp16 for Neon VMLA, VMLS operations

target/arm/translate-neon.c.inc
/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This file is intended to be included from translate.c; it uses
 * some macros and definitions provided by that file.
 * It might be possible to convert it to a standalone .c file eventually.
 */

static inline int plus1(DisasContext *s, int x)
{
    return x + 1;
}

static inline int rsub_64(DisasContext *s, int x)
{
    return 64 - x;
}

static inline int rsub_32(DisasContext *s, int x)
{
    return 32 - x;
}

static inline int rsub_16(DisasContext *s, int x)
{
    return 16 - x;
}

static inline int rsub_8(DisasContext *s, int x)
{
    return 8 - x;
}

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
 * where 0 is the least significant end of the register.
 */
static inline long
neon_element_offset(int reg, int element, MemOp size)
{
    int element_size = 1 << size;
    int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
    /* Calculate the offset assuming fully little-endian,
     * then XOR to account for the order of the 8-byte units.
     */
    if (element_size < 8) {
        ofs ^= 8 - element_size;
    }
#endif
    return neon_reg_offset(reg, 0) + ofs;
}
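/*
 * Worked example for neon_element_offset() above (comment added for
 * clarity): on a big-endian host, loading 16-bit element 0 of a D register
 * gives element_size = 2, so ofs = 0 ^ (8 - 2) = 6 -- the architecturally
 * least significant halfword sits at the high-address end of the host's
 * big-endian 8-byte unit.
 */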

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
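/*
 * A note on the checks above (comment added for clarity): because a Q
 * register is a pair of consecutive D registers, a Q-sized operand must
 * use an even D register number, so "(a->vn | a->vm | a->vd) & a->q"
 * rejects any odd register number when Q is set.  opr_sz then comes out
 * as 8 bytes for a D-register operation and 16 for a Q-register one.
 */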

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
{
    int opr_sz;
    gen_helper_gvec_3 *fn_gvec;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       opr_sz, opr_sz, 0, fn_gvec);
    return true;
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    gen_helper_gvec_3_ptr *fn_gvec_ptr;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
                   : gen_helper_gvec_fcmlah_idx);
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz,
                       (a->index << 2) | a->rot, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
{
    gen_helper_gvec_3 *fn_gvec;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(FPST_STD);
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->rm),
                       opr_sz, opr_sz, a->index, fn_gvec);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
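/*
 * Worked example (comment added for clarity): the table above is indexed
 * by the instruction's itype field.  itype 7 gives {1, 1, 1}: VLD1/VST1
 * with a single D register.  itype 0 gives {1, 4, 1}: VLD4/VST4, one pass
 * writing each element across four interleaved registers at unit spacing.
 * itype 2 gives {4, 1, 1}: VLD1/VST1 with four consecutive registers,
 * transferred one register at a time.
 */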

static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}
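/*
 * Writeback semantics (comment added for clarity): Rm == 15 encodes "no
 * writeback"; Rm == 13 encodes post-increment of Rn by the constant
 * transfer size computed by the caller; any other Rm is register
 * post-indexing, Rn += Rm.
 */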

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp endian = s->be_data;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    if (size == 0) {
        endian = MO_LE;
    }
    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        size = 3;
    }
    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
                }
                tcg_gen_add_i32(addr, addr, tmp);
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}
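/*
 * Example of the endianness promotion above (comment added for clarity):
 * a VLD1.8 of one D register (interleave == 1) is performed as a single
 * 64-bit little-endian load rather than eight byte loads, since
 * consecutive little-endian elements concatenate into one larger
 * little-endian value.
 */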

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = 2;
    }
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}
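/*
 * Example (comment added for clarity): for VLD1 to all lanes, T selects
 * whether one or two D registers are written, so vec_size is 8 or 16
 * bytes; for VLD2/3/4 it instead selects a register stride of 1 or 2.
 * The odd-vd special case above exists because the 16-byte gvec dup can
 * only target a 16-byte-aligned (even) D register pair, so an odd vd
 * falls back to an 8-byte dup plus an 8-byte move.
 */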

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if ((a->size == 2) && ((a->align & 3) == 3)) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    /*
     * TODO: if we implemented alignment exceptions, we should check
     * addr against the alignment encoded in a->align here.
     */
    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }
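/*
 * For reference (comment added for clarity), the first use below,
 * DO_3SAME(VADD, tcg_gen_gvec_add), expands to:
 *
 *   static bool trans_VADD_3s(DisasContext *s, arg_3same *a)
 *   {
 *       return do_3same(s, a, tcg_gen_gvec_add);
 *   }
 *
 * i.e. all the UNDEF and access checks live in do_3same() and each
 * instruction only supplies its gvec expander.
 */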

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
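/*
 * tcg_gen_gvec_bitsel(vece, d, a, b, c) computes d = (b & a) | (c & ~a),
 * with the first source acting as the bit selector, so the three operand
 * orders above give (comment added for clarity):
 *   VBSL: Vd = (Vn & Vd) | (Vm & ~Vd)  -- selector is the old Vd
 *   VBIT: Vd = (Vn & Vm) | (Vd & ~Vm)  -- insert Vn bits where Vm is set
 *   VBIF: Vd = (Vd & Vm) | (Vn & ~Vm)  -- insert Vn bits where Vm is clear
 */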

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2);
    tcg_temp_free_i32(tmp2);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
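/*
 * Pairwise example (comment added for clarity): for VPADD.I32 the result
 * is Dd = { Dn[0] + Dn[1], Dm[0] + Dm[1] } -- each source register is
 * reduced against its own neighbouring elements, which is why
 * do_3same_pair() runs the callback once per source register rather than
 * across the two inputs.
 */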

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
                        bool reads_vd)
{
    /*
     * FP operations handled elementwise 32 bits at a time.
     * If reads_vd is true then the old value of Vd will be
     * loaded before calling the callback function. This is
     * used for multiply-accumulate type operations.
     */
    TCGv_i32 tmp, tmp2;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        tmp = neon_load_reg(a->vn, pass);
        tmp2 = neon_load_reg(a->vm, pass);
        if (reads_vd) {
            TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
            fn(tmp_rd, tmp, tmp2, fpstatus);
            neon_store_reg(a->vd, pass, tmp_rd);
            tcg_temp_free_i32(tmp);
        } else {
            fn(tmp, tmp, tmp2, fpstatus);
            neon_store_reg(a->vd, pass, tmp);
        }
        tcg_temp_free_i32(tmp2);
    }
    tcg_temp_free_ptr(fpstatus);
    return true;
}

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size != 0) {                                             \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
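/*
 * Note (comment added for clarity): VMLA and VMLS above are the subject of
 * this commit -- they now go through the gvec expanders with both a
 * single-precision and a half-precision helper, so their non-zero size
 * (fp16) encodings are accepted whenever FP16 arithmetic is present, with
 * FPST_STD_F16 supplying the half-precision float_status.
 */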

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)

static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
                             uint32_t rn_ofs, uint32_t rm_ofs,
                             uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRECPS_fp_3s);
}

WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)

static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
                              uint32_t rn_ofs, uint32_t rm_ofs,
                              uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRSQRTS_fp_3s);
}

static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                            TCGv_ptr fpstatus)
{
    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
}

static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!dc_isar_feature(aa32_simdfmac, s)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
}

static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                            TCGv_ptr fpstatus)
{
    gen_helper_vfp_negs(vn, vn);
    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
}

static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!dc_isar_feature(aa32_simdfmac, s)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
}
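/*
 * Note (comment added for clarity): VFMS negates Vn rather than the
 * product or the accumulator, computing Vd = Vd + (-Vn * Vm) with a
 * single fused rounding.  Clobbering vn in place is safe because
 * do_3same_fp() passes a temporary freshly loaded for each pass.
 */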

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
{
    /* FP operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    fpstatus = fpstatus_ptr(FPST_STD);
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_ptr(fpstatus);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            /* TODO fp16 support */                                 \
            return false;                                           \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC);                        \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }                                                                   \

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}
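/*
 * Example (comment added for clarity): for VSHR.S8 the architecture allows
 * an encoded shift of 8, which TCG's sari expander does not accept;
 * clamping to (8 << size) - 1 = 7 yields the same all-sign-bits result
 * for every input.
 */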
1355
1356 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1357                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1358 {
1359     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1360 }
1361
1362 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1363 {
1364     /* Shift out of range is architecturally valid and results in zero. */
1365     if (a->shift >= (8 << a->size)) {
1366         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1367     } else {
1368         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1369     }
1370 }
1371
1372 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1373                              NeonGenTwo64OpEnvFn *fn)
1374 {
1375     /*
1376      * 2-reg-and-shift operations, size == 3 case, where the
1377      * function needs to be passed cpu_env.
1378      */
1379     TCGv_i64 constimm;
1380     int pass;
1381
1382     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1383         return false;
1384     }
1385
1386     /* UNDEF accesses to D16-D31 if they don't exist. */
1387     if (!dc_isar_feature(aa32_simd_r32, s) &&
1388         ((a->vd | a->vm) & 0x10)) {
1389         return false;
1390     }
1391
1392     if ((a->vm | a->vd) & a->q) {
1393         return false;
1394     }
1395
1396     if (!vfp_access_check(s)) {
1397         return true;
1398     }
1399
1400     /*
1401      * To avoid excessive duplication of ops we implement shift
1402      * by immediate using the variable shift operations.
1403      */
1404     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1405
1406     for (pass = 0; pass < a->q + 1; pass++) {
1407         TCGv_i64 tmp = tcg_temp_new_i64();
1408
1409         neon_load_reg64(tmp, a->vm + pass);
1410         fn(tmp, cpu_env, tmp, constimm);
1411         neon_store_reg64(tmp, a->vd + pass);
1412         tcg_temp_free_i64(tmp);
1413     }
1414     tcg_temp_free_i64(constimm);
1415     return true;
1416 }
1417
1418 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1419                              NeonGenTwoOpEnvFn *fn)
1420 {
1421     /*
1422      * 2-reg-and-shift operations, size < 3 case, where the
1423      * helper needs to be passed cpu_env.
1424      */
1425     TCGv_i32 constimm;
1426     int pass;
1427
1428     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1429         return false;
1430     }
1431
1432     /* UNDEF accesses to D16-D31 if they don't exist. */
1433     if (!dc_isar_feature(aa32_simd_r32, s) &&
1434         ((a->vd | a->vm) & 0x10)) {
1435         return false;
1436     }
1437
1438     if ((a->vm | a->vd) & a->q) {
1439         return false;
1440     }
1441
1442     if (!vfp_access_check(s)) {
1443         return true;
1444     }
1445
1446     /*
1447      * To avoid excessive duplication of ops we implement shift
1448      * by immediate using the variable shift operations.
1449      */
1450     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1451
1452     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1453         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
1454         fn(tmp, cpu_env, tmp, constimm);
1455         neon_store_reg(a->vd, pass, tmp);
1456     }
1457     tcg_temp_free_i32(constimm);
1458     return true;
1459 }
1460
1461 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1462     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1463     {                                                                   \
1464         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1465     }                                                                   \
1466     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1467     {                                                                   \
1468         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1469             gen_helper_neon_##FUNC##8,                                  \
1470             gen_helper_neon_##FUNC##16,                                 \
1471             gen_helper_neon_##FUNC##32,                                 \
1472         };                                                              \
1473         assert(a->size < ARRAY_SIZE(fns));                              \
1474         return do_2shift_env_32(s, a, fns[a->size]);                    \
1475     }
1476
1477 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1478 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1479 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1480
1481 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1482                                 NeonGenTwo64OpFn *shiftfn,
1483                                 NeonGenNarrowEnvFn *narrowfn)
1484 {
1485     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1486     TCGv_i64 constimm, rm1, rm2;
1487     TCGv_i32 rd;
1488
1489     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1490         return false;
1491     }
1492
1493     /* UNDEF accesses to D16-D31 if they don't exist. */
1494     if (!dc_isar_feature(aa32_simd_r32, s) &&
1495         ((a->vd | a->vm) & 0x10)) {
1496         return false;
1497     }
1498
1499     if (a->vm & 1) {
1500         return false;
1501     }
1502
1503     if (!vfp_access_check(s)) {
1504         return true;
1505     }
1506
1507     /*
1508      * This is always a right shift, and the shiftfn is always a
1509      * left-shift helper, which thus needs the negated shift count.
1510      */
1511     constimm = tcg_const_i64(-a->shift);
1512     rm1 = tcg_temp_new_i64();
1513     rm2 = tcg_temp_new_i64();
1514
1515     /* Load both inputs first to avoid potential overwrite if rm == rd */
1516     neon_load_reg64(rm1, a->vm);
1517     neon_load_reg64(rm2, a->vm + 1);
1518
1519     shiftfn(rm1, rm1, constimm);
1520     rd = tcg_temp_new_i32();
1521     narrowfn(rd, cpu_env, rm1);
1522     neon_store_reg(a->vd, 0, rd);
1523
1524     shiftfn(rm2, rm2, constimm);
1525     rd = tcg_temp_new_i32();
1526     narrowfn(rd, cpu_env, rm2);
1527     neon_store_reg(a->vd, 1, rd);
1528
1529     tcg_temp_free_i64(rm1);
1530     tcg_temp_free_i64(rm2);
1531     tcg_temp_free_i64(constimm);
1532
1533     return true;
1534 }
1535
1536 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1537                                 NeonGenTwoOpFn *shiftfn,
1538                                 NeonGenNarrowEnvFn *narrowfn)
1539 {
1540     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1541     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1542     TCGv_i64 rtmp;
1543     uint32_t imm;
1544
1545     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1546         return false;
1547     }
1548
1549     /* UNDEF accesses to D16-D31 if they don't exist. */
1550     if (!dc_isar_feature(aa32_simd_r32, s) &&
1551         ((a->vd | a->vm) & 0x10)) {
1552         return false;
1553     }
1554
1555     if (a->vm & 1) {
1556         return false;
1557     }
1558
1559     if (!vfp_access_check(s)) {
1560         return true;
1561     }
1562
1563     /*
1564      * This is always a right shift, and the shiftfn is always a
1565      * left-shift helper, which thus needs the negated shift count
1566      * duplicated into each lane of the immediate value.
1567      */
1568     if (a->size == 1) {
1569         imm = (uint16_t)(-a->shift);
1570         imm |= imm << 16;
1571     } else {
1572         /* size == 2 */
1573         imm = -a->shift;
1574     }
1575     constimm = tcg_const_i32(imm);
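    /*
     * Worked example: size == 1, shift == 5 gives imm = 0xfffb
     * (-5 as a 16-bit value) duplicated into both halfword lanes,
     * i.e. constimm == 0xfffbfffb.
     */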
1576
1577     /* Load all inputs first to avoid potential overwrite */
1578     rm1 = neon_load_reg(a->vm, 0);
1579     rm2 = neon_load_reg(a->vm, 1);
1580     rm3 = neon_load_reg(a->vm + 1, 0);
1581     rm4 = neon_load_reg(a->vm + 1, 1);
1582     rtmp = tcg_temp_new_i64();
1583
1584     shiftfn(rm1, rm1, constimm);
1585     shiftfn(rm2, rm2, constimm);
1586
1587     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1588     tcg_temp_free_i32(rm2);
1589
1590     narrowfn(rm1, cpu_env, rtmp);
1591     neon_store_reg(a->vd, 0, rm1);
1592
1593     shiftfn(rm3, rm3, constimm);
1594     shiftfn(rm4, rm4, constimm);
1595     tcg_temp_free_i32(constimm);
1596
1597     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1598     tcg_temp_free_i32(rm4);
1599
1600     narrowfn(rm3, cpu_env, rtmp);
1601     tcg_temp_free_i64(rtmp);
1602     neon_store_reg(a->vd, 1, rm3);
1603     return true;
1604 }
1605
1606 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1607     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1608     {                                                                   \
1609         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1610     }
1611 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1612     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1613     {                                                                   \
1614         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1615     }
1616
1617 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1618 {
1619     tcg_gen_extrl_i64_i32(dest, src);
1620 }
1621
1622 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1623 {
1624     gen_helper_neon_narrow_u16(dest, src);
1625 }
1626
1627 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1628 {
1629     gen_helper_neon_narrow_u8(dest, src);
1630 }
1631
1632 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1633 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1634 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1635
1636 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1637 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1638 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1639
1640 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1641 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1642 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1643
1644 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1645 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1646 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1647 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1648 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1649 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1650
1651 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1652 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1653 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1654
1655 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1656 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1657 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1658
1659 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1660 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1661 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1662
1663 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1664                          NeonGenWidenFn *widenfn, bool u)
1665 {
1666     TCGv_i64 tmp;
1667     TCGv_i32 rm0, rm1;
1668     uint64_t widen_mask = 0;
1669
1670     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1671         return false;
1672     }
1673
1674     /* UNDEF accesses to D16-D31 if they don't exist. */
1675     if (!dc_isar_feature(aa32_simd_r32, s) &&
1676         ((a->vd | a->vm) & 0x10)) {
1677         return false;
1678     }
1679
1680     if (a->vd & 1) {
1681         return false;
1682     }
1683
1684     if (!vfp_access_check(s)) {
1685         return true;
1686     }
1687
1688     /*
1689      * This is a widen-and-shift operation. The shift is always less
1690      * than the width of the source type, so after widening the input
1691      * vector we can simply shift the whole 64-bit widened register,
1692      * and then clear the potential overflow bits resulting from left
1693      * bits of the narrow input appearing as right bits of the left
1694      * neighbour narrow input. Calculate a mask of bits to clear.
1695      */
1696     if ((a->shift != 0) && (a->size < 2 || u)) {
1697         int esize = 8 << a->size;
1698         widen_mask = MAKE_64BIT_MASK(0, esize);
1699         widen_mask >>= esize - a->shift;
1700         widen_mask = dup_const(a->size + 1, widen_mask);
1701     }
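    /*
     * Worked example: size == 0 (bytes -> halfwords), shift == 3:
     * esize == 8, so widen_mask == 0xff >> 5 == 0x07, duplicated into
     * each halfword lane as 0x0007000700070007. Clearing those bottom
     * three bits of every widened lane after the 64-bit left shift
     * removes the bits that spilled over from the lane below.
     */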
1702
1703     rm0 = neon_load_reg(a->vm, 0);
1704     rm1 = neon_load_reg(a->vm, 1);
1705     tmp = tcg_temp_new_i64();
1706
1707     widenfn(tmp, rm0);
1708     tcg_temp_free_i32(rm0);
1709     if (a->shift != 0) {
1710         tcg_gen_shli_i64(tmp, tmp, a->shift);
1711         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1712     }
1713     neon_store_reg64(tmp, a->vd);
1714
1715     widenfn(tmp, rm1);
1716     tcg_temp_free_i32(rm1);
1717     if (a->shift != 0) {
1718         tcg_gen_shli_i64(tmp, tmp, a->shift);
1719         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1720     }
1721     neon_store_reg64(tmp, a->vd + 1);
1722     tcg_temp_free_i64(tmp);
1723     return true;
1724 }
1725
1726 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1727 {
1728     static NeonGenWidenFn * const widenfn[] = {
1729         gen_helper_neon_widen_s8,
1730         gen_helper_neon_widen_s16,
1731         tcg_gen_ext_i32_i64,
1732     };
1733     return do_vshll_2sh(s, a, widenfn[a->size], false);
1734 }
1735
1736 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1737 {
1738     static NeonGenWidenFn * const widenfn[] = {
1739         gen_helper_neon_widen_u8,
1740         gen_helper_neon_widen_u16,
1741         tcg_gen_extu_i32_i64,
1742     };
1743     return do_vshll_2sh(s, a, widenfn[a->size], true);
1744 }
1745
1746 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1747                       NeonGenTwoSingleOpFn *fn)
1748 {
1749     /* FP operations in 2-reg-and-shift group */
1750     TCGv_i32 tmp, shiftv;
1751     TCGv_ptr fpstatus;
1752     int pass;
1753
1754     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1755         return false;
1756     }
1757
1758     /* UNDEF accesses to D16-D31 if they don't exist. */
1759     if (!dc_isar_feature(aa32_simd_r32, s) &&
1760         ((a->vd | a->vm) & 0x10)) {
1761         return false;
1762     }
1763
1764     if ((a->vm | a->vd) & a->q) {
1765         return false;
1766     }
1767
1768     if (!vfp_access_check(s)) {
1769         return true;
1770     }
1771
1772     fpstatus = fpstatus_ptr(FPST_STD);
1773     shiftv = tcg_const_i32(a->shift);
1774     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1775         tmp = neon_load_reg(a->vm, pass);
1776         fn(tmp, tmp, shiftv, fpstatus);
1777         neon_store_reg(a->vd, pass, tmp);
1778     }
1779     tcg_temp_free_ptr(fpstatus);
1780     tcg_temp_free_i32(shiftv);
1781     return true;
1782 }
1783
1784 #define DO_FP_2SH(INSN, FUNC)                                           \
1785     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1786     {                                                                   \
1787         return do_fp_2sh(s, a, FUNC);                                   \
1788     }
1789
1790 DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
1791 DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
1792 DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
1793 DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
1794
1795 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1796 {
1797     /*
1798      * Expand the encoded constant.
1799      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1800      * We choose not to special-case this and will behave as if a
1801      * valid constant encoding of 0 had been given.
1802      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1803      */
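    /*
     * Illustrative expansions: cmode = 3, op = 0, imm = 0xab gives
     * 0x0000ab00 in each 32-bit lane; cmode = 14, op = 1, imm = 0x55
     * expands each set bit of imm to an 0xff byte, giving
     * 0x00ff00ff00ff00ff.
     */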
1804     switch (cmode) {
1805     case 0: case 1:
1806         /* no-op */
1807         break;
1808     case 2: case 3:
1809         imm <<= 8;
1810         break;
1811     case 4: case 5:
1812         imm <<= 16;
1813         break;
1814     case 6: case 7:
1815         imm <<= 24;
1816         break;
1817     case 8: case 9:
1818         imm |= imm << 16;
1819         break;
1820     case 10: case 11:
1821         imm = (imm << 8) | (imm << 24);
1822         break;
1823     case 12:
1824         imm = (imm << 8) | 0xff;
1825         break;
1826     case 13:
1827         imm = (imm << 16) | 0xffff;
1828         break;
1829     case 14:
1830         if (op) {
1831             /*
1832              * This is the only case where the top and bottom 32 bits
1833              * of the encoded constant differ.
1834              */
1835             uint64_t imm64 = 0;
1836             int n;
1837
1838             for (n = 0; n < 8; n++) {
1839                 if (imm & (1 << n)) {
1840                     imm64 |= (0xffULL << (n * 8));
1841                 }
1842             }
1843             return imm64;
1844         }
1845         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1846         break;
1847     case 15:
1848         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1849             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1850         break;
1851     }
1852     if (op) {
1853         imm = ~imm;
1854     }
1855     return dup_const(MO_32, imm);
1856 }
1857
1858 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1859                         GVecGen2iFn *fn)
1860 {
1861     uint64_t imm;
1862     int reg_ofs, vec_size;
1863
1864     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1865         return false;
1866     }
1867
1868     /* UNDEF accesses to D16-D31 if they don't exist. */
1869     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1870         return false;
1871     }
1872
1873     if (a->vd & a->q) {
1874         return false;
1875     }
1876
1877     if (!vfp_access_check(s)) {
1878         return true;
1879     }
1880
1881     reg_ofs = neon_reg_offset(a->vd, 0);
1882     vec_size = a->q ? 16 : 8;
1883     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1884
1885     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1886     return true;
1887 }
1888
1889 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1890                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1891 {
1892     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1893 }
1894
1895 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1896 {
1897     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1898     GVecGen2iFn *fn;
1899
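    /* cmode 1, 3, 5, 7, 9 or 11: the VORR/VBIC immediate forms. */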
1900     if ((a->cmode & 1) && a->cmode < 12) {
1901         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1902         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1903     } else {
1904         /* There is one unallocated cmode/op combination in this space */
1905         if (a->cmode == 15 && a->op == 1) {
1906             return false;
1907         }
1908         fn = gen_VMOV_1r;
1909     }
1910     return do_1reg_imm(s, a, fn);
1911 }
1912
1913 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1914                            NeonGenWidenFn *widenfn,
1915                            NeonGenTwo64OpFn *opfn,
1916                            bool src1_wide)
1917 {
1918     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
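    /* src1_wide is true for the W forms, whose first operand is already wide. */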
1919     TCGv_i64 rn0_64, rn1_64, rm_64;
1920     TCGv_i32 rm;
1921
1922     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1923         return false;
1924     }
1925
1926     /* UNDEF accesses to D16-D31 if they don't exist. */
1927     if (!dc_isar_feature(aa32_simd_r32, s) &&
1928         ((a->vd | a->vn | a->vm) & 0x10)) {
1929         return false;
1930     }
1931
1932     if (!widenfn || !opfn) {
1933         /* size == 3 case, which is an entirely different insn group */
1934         return false;
1935     }
1936
1937     if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
1938         return false;
1939     }
1940
1941     if (!vfp_access_check(s)) {
1942         return true;
1943     }
1944
1945     rn0_64 = tcg_temp_new_i64();
1946     rn1_64 = tcg_temp_new_i64();
1947     rm_64 = tcg_temp_new_i64();
1948
1949     if (src1_wide) {
1950         neon_load_reg64(rn0_64, a->vn);
1951     } else {
1952         TCGv_i32 tmp = neon_load_reg(a->vn, 0);
1953         widenfn(rn0_64, tmp);
1954         tcg_temp_free_i32(tmp);
1955     }
1956     rm = neon_load_reg(a->vm, 0);
1957
1958     widenfn(rm_64, rm);
1959     tcg_temp_free_i32(rm);
1960     opfn(rn0_64, rn0_64, rm_64);
1961
1962     /*
1963      * Load second pass inputs before storing the first pass result, to
1964      * avoid incorrect results if a narrow input overlaps with the result.
1965      */
1966     if (src1_wide) {
1967         neon_load_reg64(rn1_64, a->vn + 1);
1968     } else {
1969         TCGv_i32 tmp = neon_load_reg(a->vn, 1);
1970         widenfn(rn1_64, tmp);
1971         tcg_temp_free_i32(tmp);
1972     }
1973     rm = neon_load_reg(a->vm, 1);
1974
1975     neon_store_reg64(rn0_64, a->vd);
1976
1977     widenfn(rm_64, rm);
1978     tcg_temp_free_i32(rm);
1979     opfn(rn1_64, rn1_64, rm_64);
1980     neon_store_reg64(rn1_64, a->vd + 1);
1981
1982     tcg_temp_free_i64(rn0_64);
1983     tcg_temp_free_i64(rn1_64);
1984     tcg_temp_free_i64(rm_64);
1985
1986     return true;
1987 }
1988
1989 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
1990     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1991     {                                                                   \
1992         static NeonGenWidenFn * const widenfn[] = {                     \
1993             gen_helper_neon_widen_##S##8,                               \
1994             gen_helper_neon_widen_##S##16,                              \
1995             tcg_gen_##EXT##_i32_i64,                                    \
1996             NULL,                                                       \
1997         };                                                              \
1998         static NeonGenTwo64OpFn * const addfn[] = {                     \
1999             gen_helper_neon_##OP##l_u16,                                \
2000             gen_helper_neon_##OP##l_u32,                                \
2001             tcg_gen_##OP##_i64,                                         \
2002             NULL,                                                       \
2003         };                                                              \
2004         return do_prewiden_3d(s, a, widenfn[a->size],                   \
2005                               addfn[a->size], SRC1WIDE);                \
2006     }
2007
2008 DO_PREWIDEN(VADDL_S, s, ext, add, false)
2009 DO_PREWIDEN(VADDL_U, u, extu, add, false)
2010 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
2011 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
2012 DO_PREWIDEN(VADDW_S, s, ext, add, true)
2013 DO_PREWIDEN(VADDW_U, u, extu, add, true)
2014 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
2015 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
2016
2017 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
2018                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
2019 {
2020     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
2021     TCGv_i64 rn_64, rm_64;
2022     TCGv_i32 rd0, rd1;
2023
2024     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2025         return false;
2026     }
2027
2028     /* UNDEF accesses to D16-D31 if they don't exist. */
2029     if (!dc_isar_feature(aa32_simd_r32, s) &&
2030         ((a->vd | a->vn | a->vm) & 0x10)) {
2031         return false;
2032     }
2033
2034     if (!opfn || !narrowfn) {
2035         /* size == 3 case, which is an entirely different insn group */
2036         return false;
2037     }
2038
2039     if ((a->vn | a->vm) & 1) {
2040         return false;
2041     }
2042
2043     if (!vfp_access_check(s)) {
2044         return true;
2045     }
2046
2047     rn_64 = tcg_temp_new_i64();
2048     rm_64 = tcg_temp_new_i64();
2049     rd0 = tcg_temp_new_i32();
2050     rd1 = tcg_temp_new_i32();
2051
2052     neon_load_reg64(rn_64, a->vn);
2053     neon_load_reg64(rm_64, a->vm);
2054
2055     opfn(rn_64, rn_64, rm_64);
2056
2057     narrowfn(rd0, rn_64);
2058
2059     neon_load_reg64(rn_64, a->vn + 1);
2060     neon_load_reg64(rm_64, a->vm + 1);
2061
2062     opfn(rn_64, rn_64, rm_64);
2063
2064     narrowfn(rd1, rn_64);
2065
2066     neon_store_reg(a->vd, 0, rd0);
2067     neon_store_reg(a->vd, 1, rd1);
2068
2069     tcg_temp_free_i64(rn_64);
2070     tcg_temp_free_i64(rm_64);
2071
2072     return true;
2073 }
2074
2075 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
2076     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2077     {                                                                   \
2078         static NeonGenTwo64OpFn * const addfn[] = {                     \
2079             gen_helper_neon_##OP##l_u16,                                \
2080             gen_helper_neon_##OP##l_u32,                                \
2081             tcg_gen_##OP##_i64,                                         \
2082             NULL,                                                       \
2083         };                                                              \
2084         static NeonGenNarrowFn * const narrowfn[] = {                   \
2085             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
2086             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
2087             EXTOP,                                                      \
2088             NULL,                                                       \
2089         };                                                              \
2090         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2091     }
2092
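/*
 * Round-to-nearest narrowing: adding 1 << 31 (half of the discarded
 * low word) before taking the high half rounds the result.
 */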
2093 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2094 {
2095     tcg_gen_addi_i64(rn, rn, 1u << 31);
2096     tcg_gen_extrh_i64_i32(rd, rn);
2097 }
2098
2099 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2100 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2101 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2102 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2103
2104 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2105                        NeonGenTwoOpWidenFn *opfn,
2106                        NeonGenTwo64OpFn *accfn)
2107 {
2108     /*
2109      * 3-regs different lengths, long operations.
2110      * These perform an operation on two inputs that returns a double-width
2111      * result, and then possibly perform an accumulation operation of
2112      * that result into the double-width destination.
2113      */
2114     TCGv_i64 rd0, rd1, tmp;
2115     TCGv_i32 rn, rm;
2116
2117     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2118         return false;
2119     }
2120
2121     /* UNDEF accesses to D16-D31 if they don't exist. */
2122     if (!dc_isar_feature(aa32_simd_r32, s) &&
2123         ((a->vd | a->vn | a->vm) & 0x10)) {
2124         return false;
2125     }
2126
2127     if (!opfn) {
2128         /* size == 3 case, which is an entirely different insn group */
2129         return false;
2130     }
2131
2132     if (a->vd & 1) {
2133         return false;
2134     }
2135
2136     if (!vfp_access_check(s)) {
2137         return true;
2138     }
2139
2140     rd0 = tcg_temp_new_i64();
2141     rd1 = tcg_temp_new_i64();
2142
2143     rn = neon_load_reg(a->vn, 0);
2144     rm = neon_load_reg(a->vm, 0);
2145     opfn(rd0, rn, rm);
2146     tcg_temp_free_i32(rn);
2147     tcg_temp_free_i32(rm);
2148
2149     rn = neon_load_reg(a->vn, 1);
2150     rm = neon_load_reg(a->vm, 1);
2151     opfn(rd1, rn, rm);
2152     tcg_temp_free_i32(rn);
2153     tcg_temp_free_i32(rm);
2154
2155     /* Don't store results until after all loads: they might overlap */
2156     if (accfn) {
2157         tmp = tcg_temp_new_i64();
2158         neon_load_reg64(tmp, a->vd);
2159         accfn(tmp, tmp, rd0);
2160         neon_store_reg64(tmp, a->vd);
2161         neon_load_reg64(tmp, a->vd + 1);
2162         accfn(tmp, tmp, rd1);
2163         neon_store_reg64(tmp, a->vd + 1);
2164         tcg_temp_free_i64(tmp);
2165     } else {
2166         neon_store_reg64(rd0, a->vd);
2167         neon_store_reg64(rd1, a->vd + 1);
2168     }
2169
2170     tcg_temp_free_i64(rd0);
2171     tcg_temp_free_i64(rd1);
2172
2173     return true;
2174 }
2175
2176 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2177 {
2178     static NeonGenTwoOpWidenFn * const opfn[] = {
2179         gen_helper_neon_abdl_s16,
2180         gen_helper_neon_abdl_s32,
2181         gen_helper_neon_abdl_s64,
2182         NULL,
2183     };
2184
2185     return do_long_3d(s, a, opfn[a->size], NULL);
2186 }
2187
2188 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2189 {
2190     static NeonGenTwoOpWidenFn * const opfn[] = {
2191         gen_helper_neon_abdl_u16,
2192         gen_helper_neon_abdl_u32,
2193         gen_helper_neon_abdl_u64,
2194         NULL,
2195     };
2196
2197     return do_long_3d(s, a, opfn[a->size], NULL);
2198 }
2199
2200 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2201 {
2202     static NeonGenTwoOpWidenFn * const opfn[] = {
2203         gen_helper_neon_abdl_s16,
2204         gen_helper_neon_abdl_s32,
2205         gen_helper_neon_abdl_s64,
2206         NULL,
2207     };
2208     static NeonGenTwo64OpFn * const addfn[] = {
2209         gen_helper_neon_addl_u16,
2210         gen_helper_neon_addl_u32,
2211         tcg_gen_add_i64,
2212         NULL,
2213     };
2214
2215     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2216 }
2217
2218 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2219 {
2220     static NeonGenTwoOpWidenFn * const opfn[] = {
2221         gen_helper_neon_abdl_u16,
2222         gen_helper_neon_abdl_u32,
2223         gen_helper_neon_abdl_u64,
2224         NULL,
2225     };
2226     static NeonGenTwo64OpFn * const addfn[] = {
2227         gen_helper_neon_addl_u16,
2228         gen_helper_neon_addl_u32,
2229         tcg_gen_add_i64,
2230         NULL,
2231     };
2232
2233     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2234 }
2235
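/*
 * Full 32x32->64 multiplies: tcg_gen_muls2/mulu2 compute the low and
 * high 32 bits of the product, which are concatenated into the result.
 */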
2236 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2237 {
2238     TCGv_i32 lo = tcg_temp_new_i32();
2239     TCGv_i32 hi = tcg_temp_new_i32();
2240
2241     tcg_gen_muls2_i32(lo, hi, rn, rm);
2242     tcg_gen_concat_i32_i64(rd, lo, hi);
2243
2244     tcg_temp_free_i32(lo);
2245     tcg_temp_free_i32(hi);
2246 }
2247
2248 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2249 {
2250     TCGv_i32 lo = tcg_temp_new_i32();
2251     TCGv_i32 hi = tcg_temp_new_i32();
2252
2253     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2254     tcg_gen_concat_i32_i64(rd, lo, hi);
2255
2256     tcg_temp_free_i32(lo);
2257     tcg_temp_free_i32(hi);
2258 }
2259
2260 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2261 {
2262     static NeonGenTwoOpWidenFn * const opfn[] = {
2263         gen_helper_neon_mull_s8,
2264         gen_helper_neon_mull_s16,
2265         gen_mull_s32,
2266         NULL,
2267     };
2268
2269     return do_long_3d(s, a, opfn[a->size], NULL);
2270 }
2271
2272 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2273 {
2274     static NeonGenTwoOpWidenFn * const opfn[] = {
2275         gen_helper_neon_mull_u8,
2276         gen_helper_neon_mull_u16,
2277         gen_mull_u32,
2278         NULL,
2279     };
2280
2281     return do_long_3d(s, a, opfn[a->size], NULL);
2282 }
2283
2284 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2285     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2286     {                                                                   \
2287         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2288             gen_helper_neon_##MULL##8,                                  \
2289             gen_helper_neon_##MULL##16,                                 \
2290             gen_##MULL##32,                                             \
2291             NULL,                                                       \
2292         };                                                              \
2293         static NeonGenTwo64OpFn * const accfn[] = {                     \
2294             gen_helper_neon_##ACC##l_u16,                               \
2295             gen_helper_neon_##ACC##l_u32,                               \
2296             tcg_gen_##ACC##_i64,                                        \
2297             NULL,                                                       \
2298         };                                                              \
2299         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2300     }
2301
2302 DO_VMLAL(VMLAL_S, mull_s, add)
2303 DO_VMLAL(VMLAL_U, mull_u, add)
2304 DO_VMLAL(VMLSL_S, mull_s, sub)
2305 DO_VMLAL(VMLSL_U, mull_u, sub)
2306
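/*
 * VQDMULL: widening multiply, with the saturating doubling implemented
 * as a saturating add of the product to itself.
 */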
2307 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2308 {
2309     gen_helper_neon_mull_s16(rd, rn, rm);
2310     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2311 }
2312
2313 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2314 {
2315     gen_mull_s32(rd, rn, rm);
2316     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2317 }
2318
2319 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2320 {
2321     static NeonGenTwoOpWidenFn * const opfn[] = {
2322         NULL,
2323         gen_VQDMULL_16,
2324         gen_VQDMULL_32,
2325         NULL,
2326     };
2327
2328     return do_long_3d(s, a, opfn[a->size], NULL);
2329 }
2330
2331 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2332 {
2333     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2334 }
2335
2336 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2337 {
2338     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2339 }
2340
2341 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2342 {
2343     static NeonGenTwoOpWidenFn * const opfn[] = {
2344         NULL,
2345         gen_VQDMULL_16,
2346         gen_VQDMULL_32,
2347         NULL,
2348     };
2349     static NeonGenTwo64OpFn * const accfn[] = {
2350         NULL,
2351         gen_VQDMLAL_acc_16,
2352         gen_VQDMLAL_acc_32,
2353         NULL,
2354     };
2355
2356     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2357 }
2358
2359 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2360 {
2361     gen_helper_neon_negl_u32(rm, rm);
2362     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2363 }
2364
2365 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2366 {
2367     tcg_gen_neg_i64(rm, rm);
2368     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2369 }
2370
2371 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2372 {
2373     static NeonGenTwoOpWidenFn * const opfn[] = {
2374         NULL,
2375         gen_VQDMULL_16,
2376         gen_VQDMULL_32,
2377         NULL,
2378     };
2379     static NeonGenTwo64OpFn * const accfn[] = {
2380         NULL,
2381         gen_VQDMLSL_acc_16,
2382         gen_VQDMLSL_acc_32,
2383         NULL,
2384     };
2385
2386     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2387 }
2388
2389 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2390 {
2391     gen_helper_gvec_3 *fn_gvec;
2392
2393     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2394         return false;
2395     }
2396
2397     /* UNDEF accesses to D16-D31 if they don't exist. */
2398     if (!dc_isar_feature(aa32_simd_r32, s) &&
2399         ((a->vd | a->vn | a->vm) & 0x10)) {
2400         return false;
2401     }
2402
2403     if (a->vd & 1) {
2404         return false;
2405     }
2406
2407     switch (a->size) {
2408     case 0:
2409         fn_gvec = gen_helper_neon_pmull_h;
2410         break;
2411     case 2:
2412         if (!dc_isar_feature(aa32_pmull, s)) {
2413             return false;
2414         }
2415         fn_gvec = gen_helper_gvec_pmull_q;
2416         break;
2417     default:
2418         return false;
2419     }
2420
2421     if (!vfp_access_check(s)) {
2422         return true;
2423     }
2424
2425     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2426                        neon_reg_offset(a->vn, 0),
2427                        neon_reg_offset(a->vm, 0),
2428                        16, 16, 0, fn_gvec);
2429     return true;
2430 }
2431
2432 static void gen_neon_dup_low16(TCGv_i32 var)
2433 {
2434     TCGv_i32 tmp = tcg_temp_new_i32();
2435     tcg_gen_ext16u_i32(var, var);
2436     tcg_gen_shli_i32(tmp, var, 16);
2437     tcg_gen_or_i32(var, var, tmp);
2438     tcg_temp_free_i32(tmp);
2439 }
2440
2441 static void gen_neon_dup_high16(TCGv_i32 var)
2442 {
2443     TCGv_i32 tmp = tcg_temp_new_i32();
2444     tcg_gen_andi_i32(var, var, 0xffff0000);
2445     tcg_gen_shri_i32(tmp, var, 16);
2446     tcg_gen_or_i32(var, var, tmp);
2447     tcg_temp_free_i32(tmp);
2448 }
2449
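/*
 * Load and splat a scalar operand. For 16-bit scalars the low three
 * bits of 'reg' select the D register, bit 3 the halfword within the
 * 32-bit word and bit 4 the word, with the chosen halfword duplicated
 * into both halves; for 32-bit scalars bits [3:0] select the D
 * register and bit 4 the word within it.
 */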
2450 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2451 {
2452     TCGv_i32 tmp;
2453     if (size == 1) {
2454         tmp = neon_load_reg(reg & 7, reg >> 4);
2455         if (reg & 8) {
2456             gen_neon_dup_high16(tmp);
2457         } else {
2458             gen_neon_dup_low16(tmp);
2459         }
2460     } else {
2461         tmp = neon_load_reg(reg & 15, reg >> 4);
2462     }
2463     return tmp;
2464 }
2465
2466 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2467                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2468 {
2469     /*
2470      * Two registers and a scalar: perform an operation between
2471      * the input elements and the scalar, and then possibly
2472      * perform an accumulation operation of that result into the
2473      * destination.
2474      */
2475     TCGv_i32 scalar;
2476     int pass;
2477
2478     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2479         return false;
2480     }
2481
2482     /* UNDEF accesses to D16-D31 if they don't exist. */
2483     if (!dc_isar_feature(aa32_simd_r32, s) &&
2484         ((a->vd | a->vn | a->vm) & 0x10)) {
2485         return false;
2486     }
2487
2488     if (!opfn) {
2489         /* Bad size (including size == 3, which is a different insn group) */
2490         return false;
2491     }
2492
2493     if (a->q && ((a->vd | a->vn) & 1)) {
2494         return false;
2495     }
2496
2497     if (!vfp_access_check(s)) {
2498         return true;
2499     }
2500
2501     scalar = neon_get_scalar(a->size, a->vm);
2502
2503     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2504         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2505         opfn(tmp, tmp, scalar);
2506         if (accfn) {
2507             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2508             accfn(tmp, rd, tmp);
2509             tcg_temp_free_i32(rd);
2510         }
2511         neon_store_reg(a->vd, pass, tmp);
2512     }
2513     tcg_temp_free_i32(scalar);
2514     return true;
2515 }
2516
2517 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2518 {
2519     static NeonGenTwoOpFn * const opfn[] = {
2520         NULL,
2521         gen_helper_neon_mul_u16,
2522         tcg_gen_mul_i32,
2523         NULL,
2524     };
2525
2526     return do_2scalar(s, a, opfn[a->size], NULL);
2527 }
2528
2529 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2530 {
2531     static NeonGenTwoOpFn * const opfn[] = {
2532         NULL,
2533         gen_helper_neon_mul_u16,
2534         tcg_gen_mul_i32,
2535         NULL,
2536     };
2537     static NeonGenTwoOpFn * const accfn[] = {
2538         NULL,
2539         gen_helper_neon_add_u16,
2540         tcg_gen_add_i32,
2541         NULL,
2542     };
2543
2544     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2545 }
2546
2547 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2548 {
2549     static NeonGenTwoOpFn * const opfn[] = {
2550         NULL,
2551         gen_helper_neon_mul_u16,
2552         tcg_gen_mul_i32,
2553         NULL,
2554     };
2555     static NeonGenTwoOpFn * const accfn[] = {
2556         NULL,
2557         gen_helper_neon_sub_u16,
2558         tcg_gen_sub_i32,
2559         NULL,
2560     };
2561
2562     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2563 }
2564
2565 /*
2566  * Rather than have a float-specific version of do_2scalar just for
2567  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2568  * a NeonGenTwoOpFn.
2569  */
2570 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2571     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2572     {                                                           \
2573         TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);             \
2574         FUNC(rd, rn, rm, fpstatus);                             \
2575         tcg_temp_free_ptr(fpstatus);                            \
2576     }
2577
2578 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2579 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2580 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
2581
2582 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2583 {
2584     static NeonGenTwoOpFn * const opfn[] = {
2585         NULL,
2586         NULL, /* TODO: fp16 support */
2587         gen_VMUL_F_mul,
2588         NULL,
2589     };
2590
2591     return do_2scalar(s, a, opfn[a->size], NULL);
2592 }
2593
2594 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2595 {
2596     static NeonGenTwoOpFn * const opfn[] = {
2597         NULL,
2598         NULL, /* TODO: fp16 support */
2599         gen_VMUL_F_mul,
2600         NULL,
2601     };
2602     static NeonGenTwoOpFn * const accfn[] = {
2603         NULL,
2604         NULL, /* TODO: fp16 support */
2605         gen_VMUL_F_add,
2606         NULL,
2607     };
2608
2609     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2610 }
2611
2612 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2613 {
2614     static NeonGenTwoOpFn * const opfn[] = {
2615         NULL,
2616         NULL, /* TODO: fp16 support */
2617         gen_VMUL_F_mul,
2618         NULL,
2619     };
2620     static NeonGenTwoOpFn * const accfn[] = {
2621         NULL,
2622         NULL, /* TODO: fp16 support */
2623         gen_VMUL_F_sub,
2624         NULL,
2625     };
2626
2627     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2628 }
2629
2630 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2631 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2632 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2633 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2634
2635 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2636 {
2637     static NeonGenTwoOpFn * const opfn[] = {
2638         NULL,
2639         gen_VQDMULH_16,
2640         gen_VQDMULH_32,
2641         NULL,
2642     };
2643
2644     return do_2scalar(s, a, opfn[a->size], NULL);
2645 }
2646
2647 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2648 {
2649     static NeonGenTwoOpFn * const opfn[] = {
2650         NULL,
2651         gen_VQRDMULH_16,
2652         gen_VQRDMULH_32,
2653         NULL,
2654     };
2655
2656     return do_2scalar(s, a, opfn[a->size], NULL);
2657 }
2658
2659 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2660                             NeonGenThreeOpEnvFn *opfn)
2661 {
2662     /*
2663      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2664      * performs a kind of fused op-then-accumulate using a helper
2665      * function that takes all of rd, rn and the scalar at once.
2666      */
2667     TCGv_i32 scalar;
2668     int pass;
2669
2670     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2671         return false;
2672     }
2673
2674     if (!dc_isar_feature(aa32_rdm, s)) {
2675         return false;
2676     }
2677
2678     /* UNDEF accesses to D16-D31 if they don't exist. */
2679     if (!dc_isar_feature(aa32_simd_r32, s) &&
2680         ((a->vd | a->vn | a->vm) & 0x10)) {
2681         return false;
2682     }
2683
2684     if (!opfn) {
2685         /* Bad size (including size == 3, which is a different insn group) */
2686         return false;
2687     }
2688
2689     if (a->q && ((a->vd | a->vn) & 1)) {
2690         return false;
2691     }
2692
2693     if (!vfp_access_check(s)) {
2694         return true;
2695     }
2696
2697     scalar = neon_get_scalar(a->size, a->vm);
2698
2699     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2700         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2701         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2702         opfn(rd, cpu_env, rn, scalar, rd);
2703         tcg_temp_free_i32(rn);
2704         neon_store_reg(a->vd, pass, rd);
2705     }
2706     tcg_temp_free_i32(scalar);
2707
2708     return true;
2709 }
2710
2711 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2712 {
2713     static NeonGenThreeOpEnvFn * const opfn[] = {
2714         NULL,
2715         gen_helper_neon_qrdmlah_s16,
2716         gen_helper_neon_qrdmlah_s32,
2717         NULL,
2718     };
2719     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2720 }
2721
2722 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2723 {
2724     static NeonGenThreeOpEnvFn * const opfn[] = {
2725         NULL,
2726         gen_helper_neon_qrdmlsh_s16,
2727         gen_helper_neon_qrdmlsh_s32,
2728         NULL,
2729     };
2730     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2731 }
2732
2733 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2734                             NeonGenTwoOpWidenFn *opfn,
2735                             NeonGenTwo64OpFn *accfn)
2736 {
2737     /*
2738      * Two registers and a scalar, long operations: perform an
2739      * operation on the input elements and the scalar which produces
2740      * a double-width result, and then possibly perform an accumulation
2741      * operation of that result into the destination.
2742      */
2743     TCGv_i32 scalar, rn;
2744     TCGv_i64 rn0_64, rn1_64;
2745
2746     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2747         return false;
2748     }
2749
2750     /* UNDEF accesses to D16-D31 if they don't exist. */
2751     if (!dc_isar_feature(aa32_simd_r32, s) &&
2752         ((a->vd | a->vn | a->vm) & 0x10)) {
2753         return false;
2754     }
2755
2756     if (!opfn) {
2757         /* Bad size (including size == 3, which is a different insn group) */
2758         return false;
2759     }
2760
2761     if (a->vd & 1) {
2762         return false;
2763     }
2764
2765     if (!vfp_access_check(s)) {
2766         return true;
2767     }
2768
2769     scalar = neon_get_scalar(a->size, a->vm);
2770
2771     /* Load all inputs before writing any outputs, in case of overlap */
2772     rn = neon_load_reg(a->vn, 0);
2773     rn0_64 = tcg_temp_new_i64();
2774     opfn(rn0_64, rn, scalar);
2775     tcg_temp_free_i32(rn);
2776
2777     rn = neon_load_reg(a->vn, 1);
2778     rn1_64 = tcg_temp_new_i64();
2779     opfn(rn1_64, rn, scalar);
2780     tcg_temp_free_i32(rn);
2781     tcg_temp_free_i32(scalar);
2782
2783     if (accfn) {
2784         TCGv_i64 t64 = tcg_temp_new_i64();
2785         neon_load_reg64(t64, a->vd);
2786         accfn(t64, t64, rn0_64);
2787         neon_store_reg64(t64, a->vd);
2788         neon_load_reg64(t64, a->vd + 1);
2789         accfn(t64, t64, rn1_64);
2790         neon_store_reg64(t64, a->vd + 1);
2791         tcg_temp_free_i64(t64);
2792     } else {
2793         neon_store_reg64(rn0_64, a->vd);
2794         neon_store_reg64(rn1_64, a->vd + 1);
2795     }
2796     tcg_temp_free_i64(rn0_64);
2797     tcg_temp_free_i64(rn1_64);
2798     return true;
2799 }
2800
2801 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2802 {
2803     static NeonGenTwoOpWidenFn * const opfn[] = {
2804         NULL,
2805         gen_helper_neon_mull_s16,
2806         gen_mull_s32,
2807         NULL,
2808     };
2809
2810     return do_2scalar_long(s, a, opfn[a->size], NULL);
2811 }
2812
2813 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2814 {
2815     static NeonGenTwoOpWidenFn * const opfn[] = {
2816         NULL,
2817         gen_helper_neon_mull_u16,
2818         gen_mull_u32,
2819         NULL,
2820     };
2821
2822     return do_2scalar_long(s, a, opfn[a->size], NULL);
2823 }
2824
2825 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2826     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2827     {                                                                   \
2828         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2829             NULL,                                                       \
2830             gen_helper_neon_##MULL##16,                                 \
2831             gen_##MULL##32,                                             \
2832             NULL,                                                       \
2833         };                                                              \
2834         static NeonGenTwo64OpFn * const accfn[] = {                     \
2835             NULL,                                                       \
2836             gen_helper_neon_##ACC##l_u32,                               \
2837             tcg_gen_##ACC##_i64,                                        \
2838             NULL,                                                       \
2839         };                                                              \
2840         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2841     }
2842
2843 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2844 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2845 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2846 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2847
2848 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2849 {
2850     static NeonGenTwoOpWidenFn * const opfn[] = {
2851         NULL,
2852         gen_VQDMULL_16,
2853         gen_VQDMULL_32,
2854         NULL,
2855     };
2856
2857     return do_2scalar_long(s, a, opfn[a->size], NULL);
2858 }
2859
2860 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2861 {
2862     static NeonGenTwoOpWidenFn * const opfn[] = {
2863         NULL,
2864         gen_VQDMULL_16,
2865         gen_VQDMULL_32,
2866         NULL,
2867     };
2868     static NeonGenTwo64OpFn * const accfn[] = {
2869         NULL,
2870         gen_VQDMLAL_acc_16,
2871         gen_VQDMLAL_acc_32,
2872         NULL,
2873     };
2874
2875     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2876 }
2877
2878 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2879 {
2880     static NeonGenTwoOpWidenFn * const opfn[] = {
2881         NULL,
2882         gen_VQDMULL_16,
2883         gen_VQDMULL_32,
2884         NULL,
2885     };
2886     static NeonGenTwo64OpFn * const accfn[] = {
2887         NULL,
2888         gen_VQDMLSL_acc_16,
2889         gen_VQDMLSL_acc_32,
2890         NULL,
2891     };
2892
2893     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2894 }
2895
2896 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2897 {
2898     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2899         return false;
2900     }
2901
2902     /* UNDEF accesses to D16-D31 if they don't exist. */
2903     if (!dc_isar_feature(aa32_simd_r32, s) &&
2904         ((a->vd | a->vn | a->vm) & 0x10)) {
2905         return false;
2906     }
2907
2908     if ((a->vn | a->vm | a->vd) & a->q) {
2909         return false;
2910     }
2911
2912     if (a->imm > 7 && !a->q) {
2913         return false;
2914     }
2915
2916     if (!vfp_access_check(s)) {
2917         return true;
2918     }
2919
2920     if (!a->q) {
2921         /* Extract 64 bits from <Vm:Vn> */
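        /*
         * tcg_gen_extract2_i64(dest, lo, hi, pos) computes
         * (lo >> pos) | (hi << (64 - pos)), so with pos == a->imm * 8
         * this picks out the 64-bit window starting imm bytes into
         * the <Vm:Vn> pair.
         */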
2922         TCGv_i64 left, right, dest;
2923
2924         left = tcg_temp_new_i64();
2925         right = tcg_temp_new_i64();
2926         dest = tcg_temp_new_i64();
2927
2928         neon_load_reg64(right, a->vn);
2929         neon_load_reg64(left, a->vm);
2930         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2931         neon_store_reg64(dest, a->vd);
2932
2933         tcg_temp_free_i64(left);
2934         tcg_temp_free_i64(right);
2935         tcg_temp_free_i64(dest);
2936     } else {
2937         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2938         TCGv_i64 left, middle, right, destleft, destright;
2939
2940         left = tcg_temp_new_i64();
2941         middle = tcg_temp_new_i64();
2942         right = tcg_temp_new_i64();
2943         destleft = tcg_temp_new_i64();
2944         destright = tcg_temp_new_i64();
2945
2946         if (a->imm < 8) {
2947             neon_load_reg64(right, a->vn);
2948             neon_load_reg64(middle, a->vn + 1);
2949             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2950             neon_load_reg64(left, a->vm);
2951             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2952         } else {
2953             neon_load_reg64(right, a->vn + 1);
2954             neon_load_reg64(middle, a->vm);
2955             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2956             neon_load_reg64(left, a->vm + 1);
2957             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2958         }
2959
2960         neon_store_reg64(destright, a->vd);
2961         neon_store_reg64(destleft, a->vd + 1);
2962
2963         tcg_temp_free_i64(destright);
2964         tcg_temp_free_i64(destleft);
2965         tcg_temp_free_i64(right);
2966         tcg_temp_free_i64(middle);
2967         tcg_temp_free_i64(left);
2968     }
2969     return true;
2970 }
2971
2972 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2973 {
2974     int n;
2975     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2976     TCGv_ptr ptr1;
2977
2978     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2979         return false;
2980     }
2981
2982     /* UNDEF accesses to D16-D31 if they don't exist. */
2983     if (!dc_isar_feature(aa32_simd_r32, s) &&
2984         ((a->vd | a->vn | a->vm) & 0x10)) {
2985         return false;
2986     }
2987
2988     if (!vfp_access_check(s)) {
2989         return true;
2990     }
2991
2992     n = a->len + 1; /* number of registers in the table list */
2993     if ((a->vn + n) > 32) {
2994         /*
2995          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2996          * helper function running off the end of the register file.
2997          */
2998         return false;
2999     }
3000     n <<= 3; /* convert the list length from registers to byte elements */
3001     if (a->op) {
3002         tmp = neon_load_reg(a->vd, 0);
3003     } else {
3004         tmp = tcg_temp_new_i32();
3005         tcg_gen_movi_i32(tmp, 0);
3006     }
3007     tmp2 = neon_load_reg(a->vm, 0);
3008     ptr1 = vfp_reg_ptr(true, a->vn);
3009     tmp4 = tcg_const_i32(n);
3010     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
3011     tcg_temp_free_i32(tmp);
3012     if (a->op) {
3013         tmp = neon_load_reg(a->vd, 1);
3014     } else {
3015         tmp = tcg_temp_new_i32();
3016         tcg_gen_movi_i32(tmp, 0);
3017     }
3018     tmp3 = neon_load_reg(a->vm, 1);
3019     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
3020     tcg_temp_free_i32(tmp4);
3021     tcg_temp_free_ptr(ptr1);
3022     neon_store_reg(a->vd, 0, tmp2);
3023     neon_store_reg(a->vd, 1, tmp3);
3024     tcg_temp_free_i32(tmp);
3025     return true;
3026 }
3027
3028 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
3029 {
3030     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3031         return false;
3032     }
3033
3034     /* UNDEF accesses to D16-D31 if they don't exist. */
3035     if (!dc_isar_feature(aa32_simd_r32, s) &&
3036         ((a->vd | a->vm) & 0x10)) {
3037         return false;
3038     }
3039
3040     if (a->vd & a->q) {
3041         return false;
3042     }
3043
3044     if (!vfp_access_check(s)) {
3045         return true;
3046     }
3047
3048     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
3049                          neon_element_offset(a->vm, a->index, a->size),
3050                          a->q ? 16 : 8, a->q ? 16 : 8);
3051     return true;
3052 }
3053
3054 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3055 {
3056     int pass, half;
3057
3058     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3059         return false;
3060     }
3061
3062     /* UNDEF accesses to D16-D31 if they don't exist. */
3063     if (!dc_isar_feature(aa32_simd_r32, s) &&
3064         ((a->vd | a->vm) & 0x10)) {
3065         return false;
3066     }
3067
3068     if ((a->vd | a->vm) & a->q) {
3069         return false;
3070     }
3071
3072     if (a->size == 3) {
3073         return false;
3074     }
3075
3076     if (!vfp_access_check(s)) {
3077         return true;
3078     }
3079
3080     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3081         TCGv_i32 tmp[2];
3082
3083         for (half = 0; half < 2; half++) {
3084             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
3085             switch (a->size) {
3086             case 0:
3087                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3088                 break;
3089             case 1:
3090                 gen_swap_half(tmp[half], tmp[half]);
3091                 break;
3092             case 2:
3093                 break;
3094             default:
3095                 g_assert_not_reached();
3096             }
3097         }
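        /* Swap the two words to complete the 64-bit element reversal. */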
3098         neon_store_reg(a->vd, pass * 2, tmp[1]);
3099         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
3100     }
3101     return true;
3102 }
3103
3104 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3105                               NeonGenWidenFn *widenfn,
3106                               NeonGenTwo64OpFn *opfn,
3107                               NeonGenTwo64OpFn *accfn)
3108 {
3109     /*
3110      * Pairwise long operations: widen both halves of the pair,
3111      * combine the pairs with the opfn, and then possibly accumulate
3112      * into the destination with the accfn.
3113      */
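    /*
     * e.g. VPADDL.S8 widens the eight byte elements of Dm and produces
     * the four halfword pair-sums; with an accfn this implements
     * VPADAL, accumulating the sums into Dd.
     */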
3114     int pass;
3115
3116     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3117         return false;
3118     }
3119
3120     /* UNDEF accesses to D16-D31 if they don't exist. */
3121     if (!dc_isar_feature(aa32_simd_r32, s) &&
3122         ((a->vd | a->vm) & 0x10)) {
3123         return false;
3124     }
3125
3126     if ((a->vd | a->vm) & a->q) {
3127         return false;
3128     }
3129
3130     if (!widenfn) {
3131         return false;
3132     }
3133
3134     if (!vfp_access_check(s)) {
3135         return true;
3136     }
3137
3138     for (pass = 0; pass < a->q + 1; pass++) {
3139         TCGv_i32 tmp;
3140         TCGv_i64 rm0_64, rm1_64, rd_64;
3141
3142         rm0_64 = tcg_temp_new_i64();
3143         rm1_64 = tcg_temp_new_i64();
3144         rd_64 = tcg_temp_new_i64();
3145         tmp = neon_load_reg(a->vm, pass * 2);
3146         widenfn(rm0_64, tmp);
3147         tcg_temp_free_i32(tmp);
3148         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3149         widenfn(rm1_64, tmp);
3150         tcg_temp_free_i32(tmp);
3151         opfn(rd_64, rm0_64, rm1_64);
3152         tcg_temp_free_i64(rm0_64);
3153         tcg_temp_free_i64(rm1_64);
3154
3155         if (accfn) {
3156             TCGv_i64 tmp64 = tcg_temp_new_i64();
3157             neon_load_reg64(tmp64, a->vd + pass);
3158             accfn(rd_64, tmp64, rd_64);
3159             tcg_temp_free_i64(tmp64);
3160         }
3161         neon_store_reg64(rd_64, a->vd + pass);
3162         tcg_temp_free_i64(rd_64);
3163     }
3164     return true;
3165 }
3166
3167 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3168 {
3169     static NeonGenWidenFn * const widenfn[] = {
3170         gen_helper_neon_widen_s8,
3171         gen_helper_neon_widen_s16,
3172         tcg_gen_ext_i32_i64,
3173         NULL,
3174     };
3175     static NeonGenTwo64OpFn * const opfn[] = {
3176         gen_helper_neon_paddl_u16,
3177         gen_helper_neon_paddl_u32,
3178         tcg_gen_add_i64,
3179         NULL,
3180     };
3181
3182     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3183 }
3184
3185 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3186 {
3187     static NeonGenWidenFn * const widenfn[] = {
3188         gen_helper_neon_widen_u8,
3189         gen_helper_neon_widen_u16,
3190         tcg_gen_extu_i32_i64,
3191         NULL,
3192     };
3193     static NeonGenTwo64OpFn * const opfn[] = {
3194         gen_helper_neon_paddl_u16,
3195         gen_helper_neon_paddl_u32,
3196         tcg_gen_add_i64,
3197         NULL,
3198     };
3199
3200     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3201 }
3202
3203 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3204 {
3205     static NeonGenWidenFn * const widenfn[] = {
3206         gen_helper_neon_widen_s8,
3207         gen_helper_neon_widen_s16,
3208         tcg_gen_ext_i32_i64,
3209         NULL,
3210     };
3211     static NeonGenTwo64OpFn * const opfn[] = {
3212         gen_helper_neon_paddl_u16,
3213         gen_helper_neon_paddl_u32,
3214         tcg_gen_add_i64,
3215         NULL,
3216     };
3217     static NeonGenTwo64OpFn * const accfn[] = {
3218         gen_helper_neon_addl_u16,
3219         gen_helper_neon_addl_u32,
3220         tcg_gen_add_i64,
3221         NULL,
3222     };
3223
3224     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3225                              accfn[a->size]);
3226 }
3227
3228 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3229 {
3230     static NeonGenWidenFn * const widenfn[] = {
3231         gen_helper_neon_widen_u8,
3232         gen_helper_neon_widen_u16,
3233         tcg_gen_extu_i32_i64,
3234         NULL,
3235     };
3236     static NeonGenTwo64OpFn * const opfn[] = {
3237         gen_helper_neon_paddl_u16,
3238         gen_helper_neon_paddl_u32,
3239         tcg_gen_add_i64,
3240         NULL,
3241     };
3242     static NeonGenTwo64OpFn * const accfn[] = {
3243         gen_helper_neon_addl_u16,
3244         gen_helper_neon_addl_u32,
3245         tcg_gen_add_i64,
3246         NULL,
3247     };
3248
3249     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3250                              accfn[a->size]);
3251 }
3252
typedef void ZipFn(TCGv_ptr, TCGv_ptr);

static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
                       ZipFn *fn)
{
    TCGv_ptr pd, pm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!fn) {
        /* Bad size or size/q combination */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    pd = vfp_reg_ptr(true, a->vd);
    pm = vfp_reg_ptr(true, a->vm);
    fn(pd, pm);
    tcg_temp_free_ptr(pd);
    tcg_temp_free_ptr(pm);
    return true;
}

static bool trans_VUZP(DisasContext *s, arg_2misc *a)
{
    static ZipFn * const fn[2][4] = {
        {
            gen_helper_neon_unzip8,
            gen_helper_neon_unzip16,
            NULL,
            NULL,
        }, {
            gen_helper_neon_qunzip8,
            gen_helper_neon_qunzip16,
            gen_helper_neon_qunzip32,
            NULL,
        }
    };
    return do_zip_uzp(s, a, fn[a->q][a->size]);
}

static bool trans_VZIP(DisasContext *s, arg_2misc *a)
{
    static ZipFn * const fn[2][4] = {
        {
            gen_helper_neon_zip8,
            gen_helper_neon_zip16,
            NULL,
            NULL,
        }, {
            gen_helper_neon_qzip8,
            gen_helper_neon_qzip16,
            gen_helper_neon_qzip32,
            NULL,
        }
    };
    return do_zip_uzp(s, a, fn[a->q][a->size]);
}

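/*
 * Narrowing moves: each 64-bit half of the Qm source is narrowed to
 * 32 bits of the Dd destination. The narrow functions take cpu_env
 * because the saturating variants (VQMOVN, VQMOVUN) may need to set
 * the QC flag.
 */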
static bool do_vmovn(DisasContext *s, arg_2misc *a,
                     NeonGenNarrowEnvFn *narrowfn)
{
    TCGv_i64 rm;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!narrowfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    neon_load_reg64(rm, a->vm);
    narrowfn(rd0, cpu_env, rm);
    neon_load_reg64(rm, a->vm + 1);
    narrowfn(rd1, cpu_env, rm);
    neon_store_reg(a->vd, 0, rd0);
    neon_store_reg(a->vd, 1, rd1);
    tcg_temp_free_i64(rm);
    return true;
}

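/*
 * The narrow-function table is indexed by a->size; size 3 is not a
 * valid encoding for these insns, so that slot stays NULL and
 * do_vmovn() rejects it as UNDEF.
 */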
#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

DO_VMOVN(VMOVN, gen_neon_narrow_u)
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)

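/*
 * VSHLL (2-reg-misc form): widen each element of Dm and shift it left
 * by the element width, i.e. by 8 << size bits. The widening is done
 * unsigned regardless of the operand type: the subsequent shift pushes
 * all of the extension bits out of the result, so signedness does not
 * matter. Both source words are loaded before anything is stored so
 * that Vd may overlap Vm.
 */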
static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
{
    TCGv_i32 rm0, rm1;
    TCGv_i64 rd;
    static NeonGenWidenFn * const widenfns[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    NeonGenWidenFn *widenfn = widenfns[a->size];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd = tcg_temp_new_i64();

    rm0 = neon_load_reg(a->vm, 0);
    rm1 = neon_load_reg(a->vm, 1);

    widenfn(rd, rm0);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd);
    widenfn(rd, rm1);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd + 1);

    tcg_temp_free_i64(rd);
    tcg_temp_free_i32(rm0);
    tcg_temp_free_i32(rm1);
    return true;
}

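/*
 * VCVT between half and single precision, narrowing: four f32
 * elements in Qm become four f16 results packed into Dd. FPST_STD is
 * the Neon "standard FPSCR" float_status, and the AHP flag selects
 * between IEEE and Arm alternative half-precision formats. All source
 * elements are loaded before the first store so Dd may overlap Qm.
 */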
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = neon_load_reg(a->vm, 0);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = neon_load_reg(a->vm, 1);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    tcg_temp_free_i32(tmp);
    tmp = neon_load_reg(a->vm, 2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = neon_load_reg(a->vm, 3);
    neon_store_reg(a->vd, 0, tmp2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

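/*
 * The widening direction: four f16 elements packed in Dm become four
 * f32 results in Qd. Both source words are loaded up front so the
 * results can be stored even when Qd overlaps Dm.
 */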
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 0, tmp3);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    neon_store_reg(a->vd, 1, tmp);
    tmp3 = tcg_temp_new_i32();
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 2, tmp3);
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    neon_store_reg(a->vd, 3, tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

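/*
 * Shared decode and UNDEF checks for the 2-reg-misc operations which
 * can be implemented as a single gvec expansion over the whole 64-bit
 * or 128-bit vector.
 */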
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}

#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }
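/*
 * Illustrative expansion (following the DO_VMOVN pattern above):
 * DO_2MISC_VEC(VABS, tcg_gen_gvec_abs) would generate a trans_VABS()
 * that simply hands tcg_gen_gvec_abs to do_2misc_vec().
 */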