target/arm: Convert Neon VCVT fixed-point to gvec
[qemu.git] target/arm/translate-neon.c.inc
/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This file is intended to be included from translate.c; it uses
 * some macros and definitions provided by that file.
 * It might be possible to convert it to a standalone .c file eventually.
 */

static inline int plus1(DisasContext *s, int x)
{
    return x + 1;
}

static inline int rsub_64(DisasContext *s, int x)
{
    return 64 - x;
}

static inline int rsub_32(DisasContext *s, int x)
{
    return 32 - x;
}

static inline int rsub_16(DisasContext *s, int x)
{
    return 16 - x;
}

static inline int rsub_8(DisasContext *s, int x)
{
    return 8 - x;
}

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

/*
 * Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
 * where 0 is the least significant end of the register.
 */
static inline long
neon_element_offset(int reg, int element, MemOp size)
{
    int element_size = 1 << size;
    int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
    /*
     * Calculate the offset assuming fully little-endian,
     * then XOR to account for the order of the 8-byte units.
     */
    if (element_size < 8) {
        ofs ^= 8 - element_size;
    }
#endif
    return neon_reg_offset(reg, 0) + ofs;
}
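
/*
 * Worked example (illustrative, not from the original source): on a
 * big-endian host, element 1 of a MO_16 access starts at little-endian
 * offset 1 * 2 = 2, which is XORed with 8 - 2 = 6 to give offset 4,
 * i.e. where that 16-bit element lives once the host has laid the
 * 8-byte unit out big-endian.
 */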

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}
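
/*
 * Usage sketch (illustrative, not from the original source): copying
 * element 1 of D2 into element 0 of D3 as a 32-bit unit would look like:
 *
 *     TCGv_i32 t = tcg_temp_new_i32();
 *     neon_load_element(t, 2, 1, MO_UL);
 *     neon_store_element(3, 0, MO_32, t);
 *     tcg_temp_free_i32(t);
 */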

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
{
    int opr_sz;
    gen_helper_gvec_3 *fn_gvec;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       opr_sz, opr_sz, 0, fn_gvec);
    return true;
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    gen_helper_gvec_3_ptr *fn_gvec_ptr;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
                   : gen_helper_gvec_fcmlah_idx);
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz,
                       (a->index << 2) | a->rot, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
{
    gen_helper_gvec_3 *fn_gvec;
    int opr_sz;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
    opr_sz = (1 + a->q) * 8;
    /*
     * This is an integer operation; no fp_status pointer is needed
     * (the previous code created and freed one without using it).
     */
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->rm),
                       opr_sz, opr_sz, a->index, fn_gvec);
    return true;
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
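
/*
 * For example (reading the table above): itype 0 is VLD4/VST4 with
 * four consecutive registers (nregs 1, interleave 4, spacing 1),
 * while itype 7 is the simple one-register VLD1/VST1 case (1, 1, 1).
 */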

static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}
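
/*
 * Writeback example (illustrative): "vld1.8 {d0}, [r2]!" encodes
 * rm == 13, so r2 advances by the transfer size; "vld1.8 {d0}, [r2], r3"
 * encodes rm == 3 and adds r3 instead; plain "vld1.8 {d0}, [r2]"
 * encodes rm == 15 and leaves r2 unchanged.
 */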

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp endian = s->be_data;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    if (size == 0) {
        endian = MO_LE;
    }
    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        size = 3;
    }
    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
                }
                tcg_gen_add_i32(addr, addr, tmp);
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}
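
/*
 * Illustrative note: for a little-endian interleave == 1 access such as
 * "vld1.16 {d0, d1}, [r0]" (itype 10), the element size is promoted to
 * size == 3 above, so the loop issues two 64-bit loads rather than
 * eight 16-bit ones.
 */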

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16-byte alignment */
        size = 2;
    }
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}
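
/*
 * Example (illustrative): "vld4.8 {d0[]-d3[]}, [r1]" reads four bytes
 * and replicates one into every lane of each of d0..d3; with the T bit
 * set the destination registers would instead be d0, d2, d4, d6
 * (stride 2).
 */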

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if ((a->size == 2) && ((a->align & 3) == 3)) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    /*
     * TODO: if we implemented alignment exceptions, we should check
     * addr against the alignment encoded in a->align here.
     */
    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }
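
/*
 * For reference (expansion shown for illustration),
 * DO_3SAME(VADD, tcg_gen_gvec_add) below expands to:
 *
 *     static bool trans_VADD_3s(DisasContext *s, arg_3same *a)
 *     {
 *         return do_3same(s, a, tcg_gen_gvec_add);
 *     }
 */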

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
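
/*
 * In bitwise terms, with bitsel(c, x, y) == (c & x) | (~c & y):
 * VBSL computes (d & n) | (~d & m), VBIT computes (m & n) | (~m & d),
 * and VBIF computes (m & d) | (~m & n).
 */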

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }
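
/*
 * For example, the DO_3SAME_32_ENV uses below generate trampolines
 * like this (expansion shown for illustration):
 *
 *     static void gen_VQSHL_S_tramp8(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)
 *     {
 *         gen_helper_neon_qshl_s8(d, cpu_env, n, m);
 *     }
 */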

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2);
    tcg_temp_free_i32(tmp2);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}
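
/*
 * Illustrative data flow: the result of a 32-bit pairwise op is
 * Dd = { fn(Dn[0], Dn[1]), fn(Dm[0], Dm[1]) }, which is why both halves
 * are computed into temporaries before either store happens.
 */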

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size != 0) {                                             \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
                             gen_helper_gvec_3_ptr *fn)
{
    /* FP pairwise operations */
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    fpstatus = fpstatus_ptr(a->size != 0 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpstatus, 8, 8, 0, fn);
    tcg_temp_free_ptr(fpstatus);

    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which requires the aa32_fp16_arith feature check done in the macro.
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}
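
/*
 * Out-of-range example (illustrative): "vshr.s8 d0, d1, #8" is clamped
 * to a shift of 7 above, yielding all-sign-bits in each lane, while
 * "vshr.u8 d0, d1, #8" takes the gen_zero_rd_2sh path and zeroes the
 * destination.
 */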

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        neon_load_reg64(tmp, a->vm + pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg64(tmp, a->vd + pass);
        tcg_temp_free_i64(tmp);
    }
    tcg_temp_free_i64(constimm);
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i32(dup_const(a->size, a->shift));

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg(a->vd, pass, tmp);
    }
    tcg_temp_free_i32(constimm);
    return true;
}

#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }

DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)

static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
    constimm = tcg_const_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    neon_load_reg64(rm1, a->vm);
    neon_load_reg64(rm2, a->vm + 1);

    shiftfn(rm1, rm1, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm1);
    neon_store_reg(a->vd, 0, rd);

    shiftfn(rm2, rm2, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm2);
    neon_store_reg(a->vd, 1, rd);

    tcg_temp_free_i64(rm1);
    tcg_temp_free_i64(rm2);
    tcg_temp_free_i64(constimm);

    return true;
}

static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
    constimm = tcg_const_i32(imm);

    /* Load all inputs first to avoid potential overwrite */
    rm1 = neon_load_reg(a->vm, 0);
    rm2 = neon_load_reg(a->vm, 1);
    rm3 = neon_load_reg(a->vm + 1, 0);
    rm4 = neon_load_reg(a->vm + 1, 1);
    rtmp = tcg_temp_new_i64();

    shiftfn(rm1, rm1, constimm);
    shiftfn(rm2, rm2, constimm);

    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
    tcg_temp_free_i32(rm2);

    narrowfn(rm1, cpu_env, rtmp);
    neon_store_reg(a->vd, 0, rm1);

    shiftfn(rm3, rm3, constimm);
    shiftfn(rm4, rm4, constimm);
    tcg_temp_free_i32(constimm);

    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
    tcg_temp_free_i32(rm4);

    narrowfn(rm3, cpu_env, rtmp);
    tcg_temp_free_i64(rtmp);
    neon_store_reg(a->vd, 1, rm3);
    return true;
}
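
/*
 * Illustrative constant: for "vshrn.i16 d0, q1, #5" (a->size == 1) the
 * negated count is duplicated into both 16-bit halves, giving
 * imm == 0xfffbfffb, so the variable-shift helper shifts each 16-bit
 * lane right by 5.
 */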
1469
1470 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1471     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1472     {                                                                   \
1473         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1474     }
1475 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1476     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1477     {                                                                   \
1478         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1479     }
1480
1481 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1482 {
1483     tcg_gen_extrl_i64_i32(dest, src);
1484 }
1485
1486 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1487 {
1488     gen_helper_neon_narrow_u16(dest, src);
1489 }
1490
1491 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1492 {
1493     gen_helper_neon_narrow_u8(dest, src);
1494 }
1495
1496 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1497 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1498 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1499
1500 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1501 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1502 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1503
1504 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1505 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1506 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1507
1508 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1509 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1510 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1511 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1512 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1513 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1514
1515 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1516 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1517 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1518
1519 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1520 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1521 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1522
1523 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1524 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1525 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1526
1527 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1528                          NeonGenWidenFn *widenfn, bool u)
1529 {
1530     TCGv_i64 tmp;
1531     TCGv_i32 rm0, rm1;
1532     uint64_t widen_mask = 0;
1533
1534     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1535         return false;
1536     }
1537
1538     /* UNDEF accesses to D16-D31 if they don't exist. */
1539     if (!dc_isar_feature(aa32_simd_r32, s) &&
1540         ((a->vd | a->vm) & 0x10)) {
1541         return false;
1542     }
1543
1544     if (a->vd & 1) {
1545         return false;
1546     }
1547
1548     if (!vfp_access_check(s)) {
1549         return true;
1550     }
1551
1552     /*
1553      * This is a widen-and-shift operation. The shift is always less
1554      * than the width of the source type, so after widening the input
1555      * vector we can simply shift the whole 64-bit widened register,
1556      * and then clear the overflow bits: without this, the top bits of
1557      * each widened lane would appear in the low bits of its more
1558      * significant neighbour lane. Calculate a mask of bits to clear.
1559      */
1560     if ((a->shift != 0) && (a->size < 2 || u)) {
1561         int esize = 8 << a->size;
1562         widen_mask = MAKE_64BIT_MASK(0, esize);
1563         widen_mask >>= esize - a->shift;
1564         widen_mask = dup_const(a->size + 1, widen_mask);
1565     }
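         /*
          * Walk-through: for size == 0 (esize 8) and shift == 3, widen_mask
          * is (0xff >> 5) == 0x07, duplicated into 0x0007000700070007; the
          * andi below then clears the three low bits of each 16-bit lane,
          * which the shift filled with the top bits of the less significant
          * neighbouring lane.
          */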
1566
1567     rm0 = neon_load_reg(a->vm, 0);
1568     rm1 = neon_load_reg(a->vm, 1);
1569     tmp = tcg_temp_new_i64();
1570
1571     widenfn(tmp, rm0);
1572     tcg_temp_free_i32(rm0);
1573     if (a->shift != 0) {
1574         tcg_gen_shli_i64(tmp, tmp, a->shift);
1575         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1576     }
1577     neon_store_reg64(tmp, a->vd);
1578
1579     widenfn(tmp, rm1);
1580     tcg_temp_free_i32(rm1);
1581     if (a->shift != 0) {
1582         tcg_gen_shli_i64(tmp, tmp, a->shift);
1583         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1584     }
1585     neon_store_reg64(tmp, a->vd + 1);
1586     tcg_temp_free_i64(tmp);
1587     return true;
1588 }
1589
1590 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1591 {
1592     static NeonGenWidenFn * const widenfn[] = {
1593         gen_helper_neon_widen_s8,
1594         gen_helper_neon_widen_s16,
1595         tcg_gen_ext_i32_i64,
1596     };
1597     return do_vshll_2sh(s, a, widenfn[a->size], false);
1598 }
1599
1600 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1601 {
1602     static NeonGenWidenFn * const widenfn[] = {
1603         gen_helper_neon_widen_u8,
1604         gen_helper_neon_widen_u16,
1605         tcg_gen_extu_i32_i64,
1606     };
1607     return do_vshll_2sh(s, a, widenfn[a->size], true);
1608 }
1609
1610 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1611                       gen_helper_gvec_2_ptr *fn)
1612 {
1613     /* FP operations in 2-reg-and-shift group */
1614     int vec_size = a->q ? 16 : 8;
1615     int rd_ofs = neon_reg_offset(a->vd, 0);
1616     int rm_ofs = neon_reg_offset(a->vm, 0);
1617     TCGv_ptr fpst;
1618
1619     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1620         return false;
1621     }
1622
1623     if (a->size != 0) {
1624         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1625             return false;
1626         }
1627     }
1628
1629     /* UNDEF accesses to D16-D31 if they don't exist. */
1630     if (!dc_isar_feature(aa32_simd_r32, s) &&
1631         ((a->vd | a->vm) & 0x10)) {
1632         return false;
1633     }
1634
1635     if ((a->vm | a->vd) & a->q) {
1636         return false;
1637     }
1638
1639     if (!vfp_access_check(s)) {
1640         return true;
1641     }
1642
1643     fpst = fpstatus_ptr(a->size ? FPST_STD_F16 : FPST_STD);
1644     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1645     tcg_temp_free_ptr(fpst);
1646     return true;
1647 }
1648
1649 #define DO_FP_2SH(INSN, FUNC)                                           \
1650     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1651     {                                                                   \
1652         return do_fp_2sh(s, a, FUNC);                                   \
1653     }
1654
1655 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1656 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1657 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1658 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1659
1660 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1661 {
1662     /*
1663      * Expand the encoded constant.
1664      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1665      * We choose not to special-case this and will behave as if a
1666      * valid constant encoding of 0 had been given.
1667      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1668      */
1669     switch (cmode) {
1670     case 0: case 1:
1671         /* no-op */
1672         break;
1673     case 2: case 3:
1674         imm <<= 8;
1675         break;
1676     case 4: case 5:
1677         imm <<= 16;
1678         break;
1679     case 6: case 7:
1680         imm <<= 24;
1681         break;
1682     case 8: case 9:
1683         imm |= imm << 16;
1684         break;
1685     case 10: case 11:
1686         imm = (imm << 8) | (imm << 24);
1687         break;
1688     case 12:
1689         imm = (imm << 8) | 0xff;
1690         break;
1691     case 13:
1692         imm = (imm << 16) | 0xffff;
1693         break;
1694     case 14:
1695         if (op) {
1696             /*
1697              * This is the only case where the top and bottom 32 bits
1698              * of the encoded constant differ.
1699              */
1700             uint64_t imm64 = 0;
1701             int n;
1702
1703             for (n = 0; n < 8; n++) {
1704                 if (imm & (1 << n)) {
1705                     imm64 |= (0xffULL << (n * 8));
1706                 }
1707             }
1708             return imm64;
1709         }
1710         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1711         break;
1712     case 15:
1713         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1714             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1715         break;
1716     }
1717     if (op) {
1718         imm = ~imm;
1719     }
1720     return dup_const(MO_32, imm);
1721 }
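     /*
      * Two example expansions: cmode == 12, op == 0, imm == 0xab yields
      * (0xab << 8) | 0xff == 0xabff, replicated to 0x0000abff0000abff;
      * cmode == 14, op == 1, imm == 0xa5 (bits 0, 2, 5 and 7 set) yields
      * the per-byte mask 0xff00ff0000ff00ff.
      */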
1722
1723 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1724                         GVecGen2iFn *fn)
1725 {
1726     uint64_t imm;
1727     int reg_ofs, vec_size;
1728
1729     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1730         return false;
1731     }
1732
1733     /* UNDEF accesses to D16-D31 if they don't exist. */
1734     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1735         return false;
1736     }
1737
1738     if (a->vd & a->q) {
1739         return false;
1740     }
1741
1742     if (!vfp_access_check(s)) {
1743         return true;
1744     }
1745
1746     reg_ofs = neon_reg_offset(a->vd, 0);
1747     vec_size = a->q ? 16 : 8;
1748     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1749
1750     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1751     return true;
1752 }
1753
1754 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1755                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1756 {
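         /* The constant is fully expanded already, so vece and aofs are unused. */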
1757     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1758 }
1759
1760 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1761 {
1762     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1763     GVecGen2iFn *fn;
1764
1765     if ((a->cmode & 1) && a->cmode < 12) {
1766         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1767         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1768     } else {
1769         /* There is one unallocated cmode/op combination in this space */
1770         if (a->cmode == 15 && a->op == 1) {
1771             return false;
1772         }
1773         fn = gen_VMOV_1r;
1774     }
1775     return do_1reg_imm(s, a, fn);
1776 }
1777
1778 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1779                            NeonGenWidenFn *widenfn,
1780                            NeonGenTwo64OpFn *opfn,
1781                            bool src1_wide)
1782 {
1783     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1784     TCGv_i64 rn0_64, rn1_64, rm_64;
1785     TCGv_i32 rm;
1786
1787     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1788         return false;
1789     }
1790
1791     /* UNDEF accesses to D16-D31 if they don't exist. */
1792     if (!dc_isar_feature(aa32_simd_r32, s) &&
1793         ((a->vd | a->vn | a->vm) & 0x10)) {
1794         return false;
1795     }
1796
1797     if (!widenfn || !opfn) {
1798         /* size == 3 case, which is an entirely different insn group */
1799         return false;
1800     }
1801
1802     if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
1803         return false;
1804     }
1805
1806     if (!vfp_access_check(s)) {
1807         return true;
1808     }
1809
1810     rn0_64 = tcg_temp_new_i64();
1811     rn1_64 = tcg_temp_new_i64();
1812     rm_64 = tcg_temp_new_i64();
1813
1814     if (src1_wide) {
1815         neon_load_reg64(rn0_64, a->vn);
1816     } else {
1817         TCGv_i32 tmp = neon_load_reg(a->vn, 0);
1818         widenfn(rn0_64, tmp);
1819         tcg_temp_free_i32(tmp);
1820     }
1821     rm = neon_load_reg(a->vm, 0);
1822
1823     widenfn(rm_64, rm);
1824     tcg_temp_free_i32(rm);
1825     opfn(rn0_64, rn0_64, rm_64);
1826
1827     /*
1828      * Load second pass inputs before storing the first pass result, to
1829      * avoid incorrect results if a narrow input overlaps with the result.
1830      */
1831     if (src1_wide) {
1832         neon_load_reg64(rn1_64, a->vn + 1);
1833     } else {
1834         TCGv_i32 tmp = neon_load_reg(a->vn, 1);
1835         widenfn(rn1_64, tmp);
1836         tcg_temp_free_i32(tmp);
1837     }
1838     rm = neon_load_reg(a->vm, 1);
1839
1840     neon_store_reg64(rn0_64, a->vd);
1841
1842     widenfn(rm_64, rm);
1843     tcg_temp_free_i32(rm);
1844     opfn(rn1_64, rn1_64, rm_64);
1845     neon_store_reg64(rn1_64, a->vd + 1);
1846
1847     tcg_temp_free_i64(rn0_64);
1848     tcg_temp_free_i64(rn1_64);
1849     tcg_temp_free_i64(rm_64);
1850
1851     return true;
1852 }
1853
1854 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
1855     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1856     {                                                                   \
1857         static NeonGenWidenFn * const widenfn[] = {                     \
1858             gen_helper_neon_widen_##S##8,                               \
1859             gen_helper_neon_widen_##S##16,                              \
1860             tcg_gen_##EXT##_i32_i64,                                    \
1861             NULL,                                                       \
1862         };                                                              \
1863         static NeonGenTwo64OpFn * const addfn[] = {                     \
1864             gen_helper_neon_##OP##l_u16,                                \
1865             gen_helper_neon_##OP##l_u32,                                \
1866             tcg_gen_##OP##_i64,                                         \
1867             NULL,                                                       \
1868         };                                                              \
1869         return do_prewiden_3d(s, a, widenfn[a->size],                   \
1870                               addfn[a->size], SRC1WIDE);                \
1871     }
1872
1873 DO_PREWIDEN(VADDL_S, s, ext, add, false)
1874 DO_PREWIDEN(VADDL_U, u, extu, add, false)
1875 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
1876 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
1877 DO_PREWIDEN(VADDW_S, s, ext, add, true)
1878 DO_PREWIDEN(VADDW_U, u, extu, add, true)
1879 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
1880 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
1881
1882 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1883                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1884 {
1885     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1886     TCGv_i64 rn_64, rm_64;
1887     TCGv_i32 rd0, rd1;
1888
1889     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1890         return false;
1891     }
1892
1893     /* UNDEF accesses to D16-D31 if they don't exist. */
1894     if (!dc_isar_feature(aa32_simd_r32, s) &&
1895         ((a->vd | a->vn | a->vm) & 0x10)) {
1896         return false;
1897     }
1898
1899     if (!opfn || !narrowfn) {
1900         /* size == 3 case, which is an entirely different insn group */
1901         return false;
1902     }
1903
1904     if ((a->vn | a->vm) & 1) {
1905         return false;
1906     }
1907
1908     if (!vfp_access_check(s)) {
1909         return true;
1910     }
1911
1912     rn_64 = tcg_temp_new_i64();
1913     rm_64 = tcg_temp_new_i64();
1914     rd0 = tcg_temp_new_i32();
1915     rd1 = tcg_temp_new_i32();
1916
1917     neon_load_reg64(rn_64, a->vn);
1918     neon_load_reg64(rm_64, a->vm);
1919
1920     opfn(rn_64, rn_64, rm_64);
1921
1922     narrowfn(rd0, rn_64);
1923
1924     neon_load_reg64(rn_64, a->vn + 1);
1925     neon_load_reg64(rm_64, a->vm + 1);
1926
1927     opfn(rn_64, rn_64, rm_64);
1928
1929     narrowfn(rd1, rn_64);
1930
1931     neon_store_reg(a->vd, 0, rd0);
1932     neon_store_reg(a->vd, 1, rd1);
1933
1934     tcg_temp_free_i64(rn_64);
1935     tcg_temp_free_i64(rm_64);
1936
1937     return true;
1938 }
1939
1940 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1941     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1942     {                                                                   \
1943         static NeonGenTwo64OpFn * const addfn[] = {                     \
1944             gen_helper_neon_##OP##l_u16,                                \
1945             gen_helper_neon_##OP##l_u32,                                \
1946             tcg_gen_##OP##_i64,                                         \
1947             NULL,                                                       \
1948         };                                                              \
1949         static NeonGenNarrowFn * const narrowfn[] = {                   \
1950             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1951             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1952             EXTOP,                                                      \
1953             NULL,                                                       \
1954         };                                                              \
1955         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1956     }
1957
1958 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1959 {
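         /* Add half of the low word so that the high half rounds to nearest. */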
1960     tcg_gen_addi_i64(rn, rn, 1u << 31);
1961     tcg_gen_extrh_i64_i32(rd, rn);
1962 }
1963
1964 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1965 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1966 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1967 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1968
1969 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1970                        NeonGenTwoOpWidenFn *opfn,
1971                        NeonGenTwo64OpFn *accfn)
1972 {
1973     /*
1974      * 3-regs different lengths, long operations.
1975      * These perform an operation on two inputs that returns a double-width
1976      * result, and then possibly perform an accumulation operation of
1977      * that result into the double-width destination.
1978      */
1979     TCGv_i64 rd0, rd1, tmp;
1980     TCGv_i32 rn, rm;
1981
1982     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1983         return false;
1984     }
1985
1986     /* UNDEF accesses to D16-D31 if they don't exist. */
1987     if (!dc_isar_feature(aa32_simd_r32, s) &&
1988         ((a->vd | a->vn | a->vm) & 0x10)) {
1989         return false;
1990     }
1991
1992     if (!opfn) {
1993         /* size == 3 case, which is an entirely different insn group */
1994         return false;
1995     }
1996
1997     if (a->vd & 1) {
1998         return false;
1999     }
2000
2001     if (!vfp_access_check(s)) {
2002         return true;
2003     }
2004
2005     rd0 = tcg_temp_new_i64();
2006     rd1 = tcg_temp_new_i64();
2007
2008     rn = neon_load_reg(a->vn, 0);
2009     rm = neon_load_reg(a->vm, 0);
2010     opfn(rd0, rn, rm);
2011     tcg_temp_free_i32(rn);
2012     tcg_temp_free_i32(rm);
2013
2014     rn = neon_load_reg(a->vn, 1);
2015     rm = neon_load_reg(a->vm, 1);
2016     opfn(rd1, rn, rm);
2017     tcg_temp_free_i32(rn);
2018     tcg_temp_free_i32(rm);
2019
2020     /* Don't store results until after all loads: they might overlap */
2021     if (accfn) {
2022         tmp = tcg_temp_new_i64();
2023         neon_load_reg64(tmp, a->vd);
2024         accfn(tmp, tmp, rd0);
2025         neon_store_reg64(tmp, a->vd);
2026         neon_load_reg64(tmp, a->vd + 1);
2027         accfn(tmp, tmp, rd1);
2028         neon_store_reg64(tmp, a->vd + 1);
2029         tcg_temp_free_i64(tmp);
2030     } else {
2031         neon_store_reg64(rd0, a->vd);
2032         neon_store_reg64(rd1, a->vd + 1);
2033     }
2034
2035     tcg_temp_free_i64(rd0);
2036     tcg_temp_free_i64(rd1);
2037
2038     return true;
2039 }
2040
2041 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2042 {
2043     static NeonGenTwoOpWidenFn * const opfn[] = {
2044         gen_helper_neon_abdl_s16,
2045         gen_helper_neon_abdl_s32,
2046         gen_helper_neon_abdl_s64,
2047         NULL,
2048     };
2049
2050     return do_long_3d(s, a, opfn[a->size], NULL);
2051 }
2052
2053 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2054 {
2055     static NeonGenTwoOpWidenFn * const opfn[] = {
2056         gen_helper_neon_abdl_u16,
2057         gen_helper_neon_abdl_u32,
2058         gen_helper_neon_abdl_u64,
2059         NULL,
2060     };
2061
2062     return do_long_3d(s, a, opfn[a->size], NULL);
2063 }
2064
2065 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2066 {
2067     static NeonGenTwoOpWidenFn * const opfn[] = {
2068         gen_helper_neon_abdl_s16,
2069         gen_helper_neon_abdl_s32,
2070         gen_helper_neon_abdl_s64,
2071         NULL,
2072     };
2073     static NeonGenTwo64OpFn * const addfn[] = {
2074         gen_helper_neon_addl_u16,
2075         gen_helper_neon_addl_u32,
2076         tcg_gen_add_i64,
2077         NULL,
2078     };
2079
2080     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2081 }
2082
2083 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2084 {
2085     static NeonGenTwoOpWidenFn * const opfn[] = {
2086         gen_helper_neon_abdl_u16,
2087         gen_helper_neon_abdl_u32,
2088         gen_helper_neon_abdl_u64,
2089         NULL,
2090     };
2091     static NeonGenTwo64OpFn * const addfn[] = {
2092         gen_helper_neon_addl_u16,
2093         gen_helper_neon_addl_u32,
2094         tcg_gen_add_i64,
2095         NULL,
2096     };
2097
2098     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2099 }
2100
2101 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2102 {
2103     TCGv_i32 lo = tcg_temp_new_i32();
2104     TCGv_i32 hi = tcg_temp_new_i32();
2105
2106     tcg_gen_muls2_i32(lo, hi, rn, rm);
2107     tcg_gen_concat_i32_i64(rd, lo, hi);
2108
2109     tcg_temp_free_i32(lo);
2110     tcg_temp_free_i32(hi);
2111 }
2112
2113 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2114 {
2115     TCGv_i32 lo = tcg_temp_new_i32();
2116     TCGv_i32 hi = tcg_temp_new_i32();
2117
2118     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2119     tcg_gen_concat_i32_i64(rd, lo, hi);
2120
2121     tcg_temp_free_i32(lo);
2122     tcg_temp_free_i32(hi);
2123 }
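     /*
      * The two helpers above assemble the full 64-bit product of 32-bit
      * operands from TCG's double-width multiply, which delivers the low
      * and high halves in separate 32-bit temporaries.
      */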
2124
2125 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2126 {
2127     static NeonGenTwoOpWidenFn * const opfn[] = {
2128         gen_helper_neon_mull_s8,
2129         gen_helper_neon_mull_s16,
2130         gen_mull_s32,
2131         NULL,
2132     };
2133
2134     return do_long_3d(s, a, opfn[a->size], NULL);
2135 }
2136
2137 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2138 {
2139     static NeonGenTwoOpWidenFn * const opfn[] = {
2140         gen_helper_neon_mull_u8,
2141         gen_helper_neon_mull_u16,
2142         gen_mull_u32,
2143         NULL,
2144     };
2145
2146     return do_long_3d(s, a, opfn[a->size], NULL);
2147 }
2148
2149 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2150     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2151     {                                                                   \
2152         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2153             gen_helper_neon_##MULL##8,                                  \
2154             gen_helper_neon_##MULL##16,                                 \
2155             gen_##MULL##32,                                             \
2156             NULL,                                                       \
2157         };                                                              \
2158         static NeonGenTwo64OpFn * const accfn[] = {                     \
2159             gen_helper_neon_##ACC##l_u16,                               \
2160             gen_helper_neon_##ACC##l_u32,                               \
2161             tcg_gen_##ACC##_i64,                                        \
2162             NULL,                                                       \
2163         };                                                              \
2164         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2165     }
2166
2167 DO_VMLAL(VMLAL_S,mull_s,add)
2168 DO_VMLAL(VMLAL_U,mull_u,add)
2169 DO_VMLAL(VMLSL_S,mull_s,sub)
2170 DO_VMLAL(VMLSL_U,mull_u,sub)
2171
2172 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2173 {
2174     gen_helper_neon_mull_s16(rd, rn, rm);
2175     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2176 }
2177
2178 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2179 {
2180     gen_mull_s32(rd, rn, rm);
2181     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2182 }
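     /*
      * In both widths the doubling in VQDMULL is implemented as a
      * saturating add of the widened product to itself, which also sets
      * the QC flag on overflow.
      */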
2183
2184 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2185 {
2186     static NeonGenTwoOpWidenFn * const opfn[] = {
2187         NULL,
2188         gen_VQDMULL_16,
2189         gen_VQDMULL_32,
2190         NULL,
2191     };
2192
2193     return do_long_3d(s, a, opfn[a->size], NULL);
2194 }
2195
2196 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2197 {
2198     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2199 }
2200
2201 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2202 {
2203     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2204 }
2205
2206 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2207 {
2208     static NeonGenTwoOpWidenFn * const opfn[] = {
2209         NULL,
2210         gen_VQDMULL_16,
2211         gen_VQDMULL_32,
2212         NULL,
2213     };
2214     static NeonGenTwo64OpFn * const accfn[] = {
2215         NULL,
2216         gen_VQDMLAL_acc_16,
2217         gen_VQDMLAL_acc_32,
2218         NULL,
2219     };
2220
2221     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2222 }
2223
2224 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2225 {
2226     gen_helper_neon_negl_u32(rm, rm);
2227     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2228 }
2229
2230 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2231 {
2232     tcg_gen_neg_i64(rm, rm);
2233     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2234 }
2235
2236 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2237 {
2238     static NeonGenTwoOpWidenFn * const opfn[] = {
2239         NULL,
2240         gen_VQDMULL_16,
2241         gen_VQDMULL_32,
2242         NULL,
2243     };
2244     static NeonGenTwo64OpFn * const accfn[] = {
2245         NULL,
2246         gen_VQDMLSL_acc_16,
2247         gen_VQDMLSL_acc_32,
2248         NULL,
2249     };
2250
2251     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2252 }
2253
2254 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2255 {
2256     gen_helper_gvec_3 *fn_gvec;
2257
2258     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2259         return false;
2260     }
2261
2262     /* UNDEF accesses to D16-D31 if they don't exist. */
2263     if (!dc_isar_feature(aa32_simd_r32, s) &&
2264         ((a->vd | a->vn | a->vm) & 0x10)) {
2265         return false;
2266     }
2267
2268     if (a->vd & 1) {
2269         return false;
2270     }
2271
2272     switch (a->size) {
2273     case 0:
2274         fn_gvec = gen_helper_neon_pmull_h;
2275         break;
2276     case 2:
2277         if (!dc_isar_feature(aa32_pmull, s)) {
2278             return false;
2279         }
2280         fn_gvec = gen_helper_gvec_pmull_q;
2281         break;
2282     default:
2283         return false;
2284     }
2285
2286     if (!vfp_access_check(s)) {
2287         return true;
2288     }
2289
2290     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2291                        neon_reg_offset(a->vn, 0),
2292                        neon_reg_offset(a->vm, 0),
2293                        16, 16, 0, fn_gvec);
2294     return true;
2295 }
2296
2297 static void gen_neon_dup_low16(TCGv_i32 var)
2298 {
2299     TCGv_i32 tmp = tcg_temp_new_i32();
2300     tcg_gen_ext16u_i32(var, var);
2301     tcg_gen_shli_i32(tmp, var, 16);
2302     tcg_gen_or_i32(var, var, tmp);
2303     tcg_temp_free_i32(tmp);
2304 }
2305
2306 static void gen_neon_dup_high16(TCGv_i32 var)
2307 {
2308     TCGv_i32 tmp = tcg_temp_new_i32();
2309     tcg_gen_andi_i32(var, var, 0xffff0000);
2310     tcg_gen_shri_i32(tmp, var, 16);
2311     tcg_gen_or_i32(var, var, tmp);
2312     tcg_temp_free_i32(tmp);
2313 }
2314
2315 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2316 {
2317     TCGv_i32 tmp;
2318     if (size == 1) {
2319         tmp = neon_load_reg(reg & 7, reg >> 4);
2320         if (reg & 8) {
2321             gen_neon_dup_high16(tmp);
2322         } else {
2323             gen_neon_dup_low16(tmp);
2324         }
2325     } else {
2326         tmp = neon_load_reg(reg & 15, reg >> 4);
2327     }
2328     return tmp;
2329 }
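     /*
      * Example of the scalar index decode above: for size == 1 the value
      * 0b1101 selects the high 16 bits of the first word of D5
      * (reg & 7 == 5, reg >> 4 == 0, reg & 8 set), duplicated into both
      * halves of the returned value.
      */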
2330
2331 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2332                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2333 {
2334     /*
2335      * Two registers and a scalar: perform an operation between
2336      * the input elements and the scalar, and then possibly
2337      * perform an accumulation operation of that result into the
2338      * destination.
2339      */
2340     TCGv_i32 scalar;
2341     int pass;
2342
2343     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2344         return false;
2345     }
2346
2347     /* UNDEF accesses to D16-D31 if they don't exist. */
2348     if (!dc_isar_feature(aa32_simd_r32, s) &&
2349         ((a->vd | a->vn | a->vm) & 0x10)) {
2350         return false;
2351     }
2352
2353     if (!opfn) {
2354         /* Bad size (including size == 3, which is a different insn group) */
2355         return false;
2356     }
2357
2358     if (a->q && ((a->vd | a->vn) & 1)) {
2359         return false;
2360     }
2361
2362     if (!vfp_access_check(s)) {
2363         return true;
2364     }
2365
2366     scalar = neon_get_scalar(a->size, a->vm);
2367
2368     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2369         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2370         opfn(tmp, tmp, scalar);
2371         if (accfn) {
2372             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2373             accfn(tmp, rd, tmp);
2374             tcg_temp_free_i32(rd);
2375         }
2376         neon_store_reg(a->vd, pass, tmp);
2377     }
2378     tcg_temp_free_i32(scalar);
2379     return true;
2380 }
2381
2382 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2383 {
2384     static NeonGenTwoOpFn * const opfn[] = {
2385         NULL,
2386         gen_helper_neon_mul_u16,
2387         tcg_gen_mul_i32,
2388         NULL,
2389     };
2390
2391     return do_2scalar(s, a, opfn[a->size], NULL);
2392 }
2393
2394 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2395 {
2396     static NeonGenTwoOpFn * const opfn[] = {
2397         NULL,
2398         gen_helper_neon_mul_u16,
2399         tcg_gen_mul_i32,
2400         NULL,
2401     };
2402     static NeonGenTwoOpFn * const accfn[] = {
2403         NULL,
2404         gen_helper_neon_add_u16,
2405         tcg_gen_add_i32,
2406         NULL,
2407     };
2408
2409     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2410 }
2411
2412 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2413 {
2414     static NeonGenTwoOpFn * const opfn[] = {
2415         NULL,
2416         gen_helper_neon_mul_u16,
2417         tcg_gen_mul_i32,
2418         NULL,
2419     };
2420     static NeonGenTwoOpFn * const accfn[] = {
2421         NULL,
2422         gen_helper_neon_sub_u16,
2423         tcg_gen_sub_i32,
2424         NULL,
2425     };
2426
2427     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2428 }
2429
2430 /*
2431  * Rather than have a float-specific version of do_2scalar just for
2432  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2433  * a NeonGenTwoOpFn.
2434  */
2435 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2436     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2437     {                                                           \
2438         TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);             \
2439         FUNC(rd, rn, rm, fpstatus);                             \
2440         tcg_temp_free_ptr(fpstatus);                            \
2441     }
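     /*
      * For example, WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
      * expands to:
      *
      *     static void gen_VMUL_F_mul(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm)
      *     {
      *         TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);
      *         gen_helper_vfp_muls(rd, rn, rm, fpstatus);
      *         tcg_temp_free_ptr(fpstatus);
      *     }
      */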
2442
2443 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2444 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2445 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
2446
2447 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2448 {
2449     static NeonGenTwoOpFn * const opfn[] = {
2450         NULL,
2451         NULL, /* TODO: fp16 support */
2452         gen_VMUL_F_mul,
2453         NULL,
2454     };
2455
2456     return do_2scalar(s, a, opfn[a->size], NULL);
2457 }
2458
2459 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2460 {
2461     static NeonGenTwoOpFn * const opfn[] = {
2462         NULL,
2463         NULL, /* TODO: fp16 support */
2464         gen_VMUL_F_mul,
2465         NULL,
2466     };
2467     static NeonGenTwoOpFn * const accfn[] = {
2468         NULL,
2469         NULL, /* TODO: fp16 support */
2470         gen_VMUL_F_add,
2471         NULL,
2472     };
2473
2474     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2475 }
2476
2477 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2478 {
2479     static NeonGenTwoOpFn * const opfn[] = {
2480         NULL,
2481         NULL, /* TODO: fp16 support */
2482         gen_VMUL_F_mul,
2483         NULL,
2484     };
2485     static NeonGenTwoOpFn * const accfn[] = {
2486         NULL,
2487         NULL, /* TODO: fp16 support */
2488         gen_VMUL_F_sub,
2489         NULL,
2490     };
2491
2492     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2493 }
2494
2495 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2496 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2497 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2498 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2499
2500 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2501 {
2502     static NeonGenTwoOpFn * const opfn[] = {
2503         NULL,
2504         gen_VQDMULH_16,
2505         gen_VQDMULH_32,
2506         NULL,
2507     };
2508
2509     return do_2scalar(s, a, opfn[a->size], NULL);
2510 }
2511
2512 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2513 {
2514     static NeonGenTwoOpFn * const opfn[] = {
2515         NULL,
2516         gen_VQRDMULH_16,
2517         gen_VQRDMULH_32,
2518         NULL,
2519     };
2520
2521     return do_2scalar(s, a, opfn[a->size], NULL);
2522 }
2523
2524 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2525                             NeonGenThreeOpEnvFn *opfn)
2526 {
2527     /*
2528      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2529      * performs a kind of fused op-then-accumulate using a helper
2530      * function that takes all of rd, rn and the scalar at once.
2531      */
2532     TCGv_i32 scalar;
2533     int pass;
2534
2535     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2536         return false;
2537     }
2538
2539     if (!dc_isar_feature(aa32_rdm, s)) {
2540         return false;
2541     }
2542
2543     /* UNDEF accesses to D16-D31 if they don't exist. */
2544     if (!dc_isar_feature(aa32_simd_r32, s) &&
2545         ((a->vd | a->vn | a->vm) & 0x10)) {
2546         return false;
2547     }
2548
2549     if (!opfn) {
2550         /* Bad size (including size == 3, which is a different insn group) */
2551         return false;
2552     }
2553
2554     if (a->q && ((a->vd | a->vn) & 1)) {
2555         return false;
2556     }
2557
2558     if (!vfp_access_check(s)) {
2559         return true;
2560     }
2561
2562     scalar = neon_get_scalar(a->size, a->vm);
2563
2564     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2565         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2566         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2567         opfn(rd, cpu_env, rn, scalar, rd);
2568         tcg_temp_free_i32(rn);
2569         neon_store_reg(a->vd, pass, rd);
2570     }
2571     tcg_temp_free_i32(scalar);
2572
2573     return true;
2574 }
2575
2576 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2577 {
2578     static NeonGenThreeOpEnvFn *opfn[] = {
2579         NULL,
2580         gen_helper_neon_qrdmlah_s16,
2581         gen_helper_neon_qrdmlah_s32,
2582         NULL,
2583     };
2584     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2585 }
2586
2587 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2588 {
2589     static NeonGenThreeOpEnvFn *opfn[] = {
2590         NULL,
2591         gen_helper_neon_qrdmlsh_s16,
2592         gen_helper_neon_qrdmlsh_s32,
2593         NULL,
2594     };
2595     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2596 }
2597
2598 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2599                             NeonGenTwoOpWidenFn *opfn,
2600                             NeonGenTwo64OpFn *accfn)
2601 {
2602     /*
2603      * Two registers and a scalar, long operations: perform an
2604      * operation on the input elements and the scalar which produces
2605      * a double-width result, and then possibly perform an accumulation
2606      * operation of that result into the destination.
2607      */
2608     TCGv_i32 scalar, rn;
2609     TCGv_i64 rn0_64, rn1_64;
2610
2611     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2612         return false;
2613     }
2614
2615     /* UNDEF accesses to D16-D31 if they don't exist. */
2616     if (!dc_isar_feature(aa32_simd_r32, s) &&
2617         ((a->vd | a->vn | a->vm) & 0x10)) {
2618         return false;
2619     }
2620
2621     if (!opfn) {
2622         /* Bad size (including size == 3, which is a different insn group) */
2623         return false;
2624     }
2625
2626     if (a->vd & 1) {
2627         return false;
2628     }
2629
2630     if (!vfp_access_check(s)) {
2631         return true;
2632     }
2633
2634     scalar = neon_get_scalar(a->size, a->vm);
2635
2636     /* Load all inputs before writing any outputs, in case of overlap */
2637     rn = neon_load_reg(a->vn, 0);
2638     rn0_64 = tcg_temp_new_i64();
2639     opfn(rn0_64, rn, scalar);
2640     tcg_temp_free_i32(rn);
2641
2642     rn = neon_load_reg(a->vn, 1);
2643     rn1_64 = tcg_temp_new_i64();
2644     opfn(rn1_64, rn, scalar);
2645     tcg_temp_free_i32(rn);
2646     tcg_temp_free_i32(scalar);
2647
2648     if (accfn) {
2649         TCGv_i64 t64 = tcg_temp_new_i64();
2650         neon_load_reg64(t64, a->vd);
2651         accfn(t64, t64, rn0_64);
2652         neon_store_reg64(t64, a->vd);
2653         neon_load_reg64(t64, a->vd + 1);
2654         accfn(t64, t64, rn1_64);
2655         neon_store_reg64(t64, a->vd + 1);
2656         tcg_temp_free_i64(t64);
2657     } else {
2658         neon_store_reg64(rn0_64, a->vd);
2659         neon_store_reg64(rn1_64, a->vd + 1);
2660     }
2661     tcg_temp_free_i64(rn0_64);
2662     tcg_temp_free_i64(rn1_64);
2663     return true;
2664 }
2665
2666 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2667 {
2668     static NeonGenTwoOpWidenFn * const opfn[] = {
2669         NULL,
2670         gen_helper_neon_mull_s16,
2671         gen_mull_s32,
2672         NULL,
2673     };
2674
2675     return do_2scalar_long(s, a, opfn[a->size], NULL);
2676 }
2677
2678 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2679 {
2680     static NeonGenTwoOpWidenFn * const opfn[] = {
2681         NULL,
2682         gen_helper_neon_mull_u16,
2683         gen_mull_u32,
2684         NULL,
2685     };
2686
2687     return do_2scalar_long(s, a, opfn[a->size], NULL);
2688 }
2689
2690 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2691     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2692     {                                                                   \
2693         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2694             NULL,                                                       \
2695             gen_helper_neon_##MULL##16,                                 \
2696             gen_##MULL##32,                                             \
2697             NULL,                                                       \
2698         };                                                              \
2699         static NeonGenTwo64OpFn * const accfn[] = {                     \
2700             NULL,                                                       \
2701             gen_helper_neon_##ACC##l_u32,                               \
2702             tcg_gen_##ACC##_i64,                                        \
2703             NULL,                                                       \
2704         };                                                              \
2705         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2706     }
2707
2708 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2709 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2710 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2711 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2712
2713 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2714 {
2715     static NeonGenTwoOpWidenFn * const opfn[] = {
2716         NULL,
2717         gen_VQDMULL_16,
2718         gen_VQDMULL_32,
2719         NULL,
2720     };
2721
2722     return do_2scalar_long(s, a, opfn[a->size], NULL);
2723 }
2724
2725 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2726 {
2727     static NeonGenTwoOpWidenFn * const opfn[] = {
2728         NULL,
2729         gen_VQDMULL_16,
2730         gen_VQDMULL_32,
2731         NULL,
2732     };
2733     static NeonGenTwo64OpFn * const accfn[] = {
2734         NULL,
2735         gen_VQDMLAL_acc_16,
2736         gen_VQDMLAL_acc_32,
2737         NULL,
2738     };
2739
2740     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2741 }
2742
2743 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2744 {
2745     static NeonGenTwoOpWidenFn * const opfn[] = {
2746         NULL,
2747         gen_VQDMULL_16,
2748         gen_VQDMULL_32,
2749         NULL,
2750     };
2751     static NeonGenTwo64OpFn * const accfn[] = {
2752         NULL,
2753         gen_VQDMLSL_acc_16,
2754         gen_VQDMLSL_acc_32,
2755         NULL,
2756     };
2757
2758     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2759 }
2760
2761 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2762 {
2763     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2764         return false;
2765     }
2766
2767     /* UNDEF accesses to D16-D31 if they don't exist. */
2768     if (!dc_isar_feature(aa32_simd_r32, s) &&
2769         ((a->vd | a->vn | a->vm) & 0x10)) {
2770         return false;
2771     }
2772
2773     if ((a->vn | a->vm | a->vd) & a->q) {
2774         return false;
2775     }
2776
2777     if (a->imm > 7 && !a->q) {
2778         return false;
2779     }
2780
2781     if (!vfp_access_check(s)) {
2782         return true;
2783     }
2784
2785     if (!a->q) {
2786         /* Extract 64 bits from <Vm:Vn> */
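             /*
              * tcg_gen_extract2_i64() below computes (Vm:Vn) >> (imm * 8);
              * for example imm == 3 selects bytes 3..10 of the 128-bit pair.
              */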
2787         TCGv_i64 left, right, dest;
2788
2789         left = tcg_temp_new_i64();
2790         right = tcg_temp_new_i64();
2791         dest = tcg_temp_new_i64();
2792
2793         neon_load_reg64(right, a->vn);
2794         neon_load_reg64(left, a->vm);
2795         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2796         neon_store_reg64(dest, a->vd);
2797
2798         tcg_temp_free_i64(left);
2799         tcg_temp_free_i64(right);
2800         tcg_temp_free_i64(dest);
2801     } else {
2802         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2803         TCGv_i64 left, middle, right, destleft, destright;
2804
2805         left = tcg_temp_new_i64();
2806         middle = tcg_temp_new_i64();
2807         right = tcg_temp_new_i64();
2808         destleft = tcg_temp_new_i64();
2809         destright = tcg_temp_new_i64();
2810
2811         if (a->imm < 8) {
2812             neon_load_reg64(right, a->vn);
2813             neon_load_reg64(middle, a->vn + 1);
2814             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2815             neon_load_reg64(left, a->vm);
2816             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2817         } else {
2818             neon_load_reg64(right, a->vn + 1);
2819             neon_load_reg64(middle, a->vm);
2820             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2821             neon_load_reg64(left, a->vm + 1);
2822             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2823         }
2824
2825         neon_store_reg64(destright, a->vd);
2826         neon_store_reg64(destleft, a->vd + 1);
2827
2828         tcg_temp_free_i64(destright);
2829         tcg_temp_free_i64(destleft);
2830         tcg_temp_free_i64(right);
2831         tcg_temp_free_i64(middle);
2832         tcg_temp_free_i64(left);
2833     }
2834     return true;
2835 }
2836
2837 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2838 {
2839     int n;
2840     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2841     TCGv_ptr ptr1;
2842
2843     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2844         return false;
2845     }
2846
2847     /* UNDEF accesses to D16-D31 if they don't exist. */
2848     if (!dc_isar_feature(aa32_simd_r32, s) &&
2849         ((a->vd | a->vn | a->vm) & 0x10)) {
2850         return false;
2851     }
2852
2853     if (!vfp_access_check(s)) {
2854         return true;
2855     }
2856
2857     n = a->len + 1;
2858     if ((a->vn + n) > 32) {
2859         /*
2860          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2861          * helper function running off the end of the register file.
2862          */
2863         return false;
2864     }
2865     n <<= 3;
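         /* n is now the table length in bytes: 8 bytes per D register. */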
2866     if (a->op) {
2867         tmp = neon_load_reg(a->vd, 0);
2868     } else {
2869         tmp = tcg_temp_new_i32();
2870         tcg_gen_movi_i32(tmp, 0);
2871     }
2872     tmp2 = neon_load_reg(a->vm, 0);
2873     ptr1 = vfp_reg_ptr(true, a->vn);
2874     tmp4 = tcg_const_i32(n);
2875     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
2876     tcg_temp_free_i32(tmp);
2877     if (a->op) {
2878         tmp = neon_load_reg(a->vd, 1);
2879     } else {
2880         tmp = tcg_temp_new_i32();
2881         tcg_gen_movi_i32(tmp, 0);
2882     }
2883     tmp3 = neon_load_reg(a->vm, 1);
2884     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
2885     tcg_temp_free_i32(tmp4);
2886     tcg_temp_free_ptr(ptr1);
2887     neon_store_reg(a->vd, 0, tmp2);
2888     neon_store_reg(a->vd, 1, tmp3);
2889     tcg_temp_free_i32(tmp);
2890     return true;
2891 }
2892
2893 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2894 {
2895     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2896         return false;
2897     }
2898
2899     /* UNDEF accesses to D16-D31 if they don't exist. */
2900     if (!dc_isar_feature(aa32_simd_r32, s) &&
2901         ((a->vd | a->vm) & 0x10)) {
2902         return false;
2903     }
2904
2905     if (a->vd & a->q) {
2906         return false;
2907     }
2908
2909     if (!vfp_access_check(s)) {
2910         return true;
2911     }
2912
2913     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
2914                          neon_element_offset(a->vm, a->index, a->size),
2915                          a->q ? 16 : 8, a->q ? 16 : 8);
2916     return true;
2917 }
2918
2919 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2920 {
2921     int pass, half;
2922
2923     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2924         return false;
2925     }
2926
2927     /* UNDEF accesses to D16-D31 if they don't exist. */
2928     if (!dc_isar_feature(aa32_simd_r32, s) &&
2929         ((a->vd | a->vm) & 0x10)) {
2930         return false;
2931     }
2932
2933     if ((a->vd | a->vm) & a->q) {
2934         return false;
2935     }
2936
2937     if (a->size == 3) {
2938         return false;
2939     }
2940
2941     if (!vfp_access_check(s)) {
2942         return true;
2943     }
2944
2945     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2946         TCGv_i32 tmp[2];
2947
2948         for (half = 0; half < 2; half++) {
2949             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
2950             switch (a->size) {
2951             case 0:
2952                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2953                 break;
2954             case 1:
2955                 gen_swap_half(tmp[half], tmp[half]);
2956                 break;
2957             case 2:
2958                 break;
2959             default:
2960                 g_assert_not_reached();
2961             }
2962         }
2963         neon_store_reg(a->vd, pass * 2, tmp[1]);
2964         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
2965     }
2966     return true;
2967 }
2968
2969 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2970                               NeonGenWidenFn *widenfn,
2971                               NeonGenTwo64OpFn *opfn,
2972                               NeonGenTwo64OpFn *accfn)
2973 {
2974     /*
2975      * Pairwise long operations: widen both halves of the pair,
2976      * combine the pairs with the opfn, and then possibly accumulate
2977      * into the destination with the accfn.
2978      */
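         /*
          * For instance VPADDL.S8 widens each byte to 16 bits and yields
          * four 16-bit sums of adjacent pairs per D register; the pairwise
          * combination itself happens inside the paddl helpers.
          */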
2979     int pass;
2980
2981     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2982         return false;
2983     }
2984
2985     /* UNDEF accesses to D16-D31 if they don't exist. */
2986     if (!dc_isar_feature(aa32_simd_r32, s) &&
2987         ((a->vd | a->vm) & 0x10)) {
2988         return false;
2989     }
2990
2991     if ((a->vd | a->vm) & a->q) {
2992         return false;
2993     }
2994
2995     if (!widenfn) {
2996         return false;
2997     }
2998
2999     if (!vfp_access_check(s)) {
3000         return true;
3001     }
3002
3003     for (pass = 0; pass < a->q + 1; pass++) {
3004         TCGv_i32 tmp;
3005         TCGv_i64 rm0_64, rm1_64, rd_64;
3006
3007         rm0_64 = tcg_temp_new_i64();
3008         rm1_64 = tcg_temp_new_i64();
3009         rd_64 = tcg_temp_new_i64();
3010         tmp = neon_load_reg(a->vm, pass * 2);
3011         widenfn(rm0_64, tmp);
3012         tcg_temp_free_i32(tmp);
3013         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3014         widenfn(rm1_64, tmp);
3015         tcg_temp_free_i32(tmp);
3016         opfn(rd_64, rm0_64, rm1_64);
3017         tcg_temp_free_i64(rm0_64);
3018         tcg_temp_free_i64(rm1_64);
3019
3020         if (accfn) {
3021             TCGv_i64 tmp64 = tcg_temp_new_i64();
3022             neon_load_reg64(tmp64, a->vd + pass);
3023             accfn(rd_64, tmp64, rd_64);
3024             tcg_temp_free_i64(tmp64);
3025         }
3026         neon_store_reg64(rd_64, a->vd + pass);
3027         tcg_temp_free_i64(rd_64);
3028     }
3029     return true;
3030 }
3031
3032 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3033 {
3034     static NeonGenWidenFn * const widenfn[] = {
3035         gen_helper_neon_widen_s8,
3036         gen_helper_neon_widen_s16,
3037         tcg_gen_ext_i32_i64,
3038         NULL,
3039     };
3040     static NeonGenTwo64OpFn * const opfn[] = {
3041         gen_helper_neon_paddl_u16,
3042         gen_helper_neon_paddl_u32,
3043         tcg_gen_add_i64,
3044         NULL,
3045     };
3046
3047     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3048 }
3049
3050 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3051 {
3052     static NeonGenWidenFn * const widenfn[] = {
3053         gen_helper_neon_widen_u8,
3054         gen_helper_neon_widen_u16,
3055         tcg_gen_extu_i32_i64,
3056         NULL,
3057     };
3058     static NeonGenTwo64OpFn * const opfn[] = {
3059         gen_helper_neon_paddl_u16,
3060         gen_helper_neon_paddl_u32,
3061         tcg_gen_add_i64,
3062         NULL,
3063     };
3064
3065     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3066 }
3067
3068 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3069 {
3070     static NeonGenWidenFn * const widenfn[] = {
3071         gen_helper_neon_widen_s8,
3072         gen_helper_neon_widen_s16,
3073         tcg_gen_ext_i32_i64,
3074         NULL,
3075     };
3076     static NeonGenTwo64OpFn * const opfn[] = {
3077         gen_helper_neon_paddl_u16,
3078         gen_helper_neon_paddl_u32,
3079         tcg_gen_add_i64,
3080         NULL,
3081     };
3082     static NeonGenTwo64OpFn * const accfn[] = {
3083         gen_helper_neon_addl_u16,
3084         gen_helper_neon_addl_u32,
3085         tcg_gen_add_i64,
3086         NULL,
3087     };
3088
3089     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3090                              accfn[a->size]);
3091 }
3092
3093 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3094 {
3095     static NeonGenWidenFn * const widenfn[] = {
3096         gen_helper_neon_widen_u8,
3097         gen_helper_neon_widen_u16,
3098         tcg_gen_extu_i32_i64,
3099         NULL,
3100     };
3101     static NeonGenTwo64OpFn * const opfn[] = {
3102         gen_helper_neon_paddl_u16,
3103         gen_helper_neon_paddl_u32,
3104         tcg_gen_add_i64,
3105         NULL,
3106     };
3107     static NeonGenTwo64OpFn * const accfn[] = {
3108         gen_helper_neon_addl_u16,
3109         gen_helper_neon_addl_u32,
3110         tcg_gen_add_i64,
3111         NULL,
3112     };
3113
3114     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3115                              accfn[a->size]);
3116 }
3117
3118 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3119
3120 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3121                        ZipFn *fn)
3122 {
3123     TCGv_ptr pd, pm;
3124
3125     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3126         return false;
3127     }
3128
3129     /* UNDEF accesses to D16-D31 if they don't exist. */
3130     if (!dc_isar_feature(aa32_simd_r32, s) &&
3131         ((a->vd | a->vm) & 0x10)) {
3132         return false;
3133     }
3134
3135     if ((a->vd | a->vm) & a->q) {
3136         return false;
3137     }
3138
3139     if (!fn) {
3140         /* Bad size or size/q combination */
3141         return false;
3142     }
3143
3144     if (!vfp_access_check(s)) {
3145         return true;
3146     }
3147
3148     pd = vfp_reg_ptr(true, a->vd);
3149     pm = vfp_reg_ptr(true, a->vm);
3150     fn(pd, pm);
3151     tcg_temp_free_ptr(pd);
3152     tcg_temp_free_ptr(pm);
3153     return true;
3154 }
3155
3156 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3157 {
3158     static ZipFn * const fn[2][4] = {
3159         {
3160             gen_helper_neon_unzip8,
3161             gen_helper_neon_unzip16,
3162             NULL,
3163             NULL,
3164         }, {
3165             gen_helper_neon_qunzip8,
3166             gen_helper_neon_qunzip16,
3167             gen_helper_neon_qunzip32,
3168             NULL,
3169         }
3170     };
3171     return do_zip_uzp(s, a, fn[a->q][a->size]);
3172 }
3173
3174 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3175 {
3176     static ZipFn * const fn[2][4] = {
3177         {
3178             gen_helper_neon_zip8,
3179             gen_helper_neon_zip16,
3180             NULL,
3181             NULL,
3182         }, {
3183             gen_helper_neon_qzip8,
3184             gen_helper_neon_qzip16,
3185             gen_helper_neon_qzip32,
3186             NULL,
3187         }
3188     };
3189     return do_zip_uzp(s, a, fn[a->q][a->size]);
3190 }
3191
3192 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3193                      NeonGenNarrowEnvFn *narrowfn)
3194 {
3195     TCGv_i64 rm;
3196     TCGv_i32 rd0, rd1;
3197
3198     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3199         return false;
3200     }
3201
3202     /* UNDEF accesses to D16-D31 if they don't exist. */
3203     if (!dc_isar_feature(aa32_simd_r32, s) &&
3204         ((a->vd | a->vm) & 0x10)) {
3205         return false;
3206     }
3207
3208     if (a->vm & 1) {
3209         return false;
3210     }
3211
3212     if (!narrowfn) {
3213         return false;
3214     }
3215
3216     if (!vfp_access_check(s)) {
3217         return true;
3218     }
3219
3220     rm = tcg_temp_new_i64();
3221     rd0 = tcg_temp_new_i32();
3222     rd1 = tcg_temp_new_i32();
3223
3224     neon_load_reg64(rm, a->vm);
3225     narrowfn(rd0, cpu_env, rm);
3226     neon_load_reg64(rm, a->vm + 1);
3227     narrowfn(rd1, cpu_env, rm);
3228     neon_store_reg(a->vd, 0, rd0);
3229     neon_store_reg(a->vd, 1, rd1);
3230     tcg_temp_free_i64(rm);
3231     return true;
3232 }
3233
3234 #define DO_VMOVN(INSN, FUNC)                                    \
3235     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3236     {                                                           \
3237         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3238             FUNC##8,                                            \
3239             FUNC##16,                                           \
3240             FUNC##32,                                           \
3241             NULL,                                               \
3242         };                                                      \
3243         return do_vmovn(s, a, narrowfn[a->size]);               \
3244     }
3245
3246 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3247 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3248 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3249 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3250
3251 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3252 {
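         /*
          * This is the two-reg-misc VSHLL, whose shift amount is fixed at
          * the element width (8 << size).
          */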
3253     TCGv_i32 rm0, rm1;
3254     TCGv_i64 rd;
3255     static NeonGenWidenFn * const widenfns[] = {
3256         gen_helper_neon_widen_u8,
3257         gen_helper_neon_widen_u16,
3258         tcg_gen_extu_i32_i64,
3259         NULL,
3260     };
3261     NeonGenWidenFn *widenfn = widenfns[a->size];
3262
3263     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3264         return false;
3265     }
3266
3267     /* UNDEF accesses to D16-D31 if they don't exist. */
3268     if (!dc_isar_feature(aa32_simd_r32, s) &&
3269         ((a->vd | a->vm) & 0x10)) {
3270         return false;
3271     }
3272
3273     if (a->vd & 1) {
3274         return false;
3275     }
3276
3277     if (!widenfn) {
3278         return false;
3279     }
3280
3281     if (!vfp_access_check(s)) {
3282         return true;
3283     }
3284
3285     rd = tcg_temp_new_i64();
3286
3287     rm0 = neon_load_reg(a->vm, 0);
3288     rm1 = neon_load_reg(a->vm, 1);
3289
3290     widenfn(rd, rm0);
3291     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3292     neon_store_reg64(rd, a->vd);
3293     widenfn(rd, rm1);
3294     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3295     neon_store_reg64(rd, a->vd + 1);
3296
3297     tcg_temp_free_i64(rd);
3298     tcg_temp_free_i32(rm0);
3299     tcg_temp_free_i32(rm1);
3300     return true;
3301 }
3302
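/* VCVT.F16.F32: narrow the four f32 elements of Qm to four f16 in Dd */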
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = neon_load_reg(a->vm, 0);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = neon_load_reg(a->vm, 1);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    tcg_temp_free_i32(tmp);
    tmp = neon_load_reg(a->vm, 2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = neon_load_reg(a->vm, 3);
    neon_store_reg(a->vd, 0, tmp2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

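/* VCVT.F32.F16: widen the four f16 elements of Dm to four f32 in Qd */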
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 0, tmp3);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    neon_store_reg(a->vd, 1, tmp);
    tmp3 = tcg_temp_new_i32();
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 2, tmp3);
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    neon_store_reg(a->vd, 3, tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

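/*
 * Handle a 2-reg-misc operation via a gvec expansion function,
 * after performing the usual Neon UNDEF checks.
 */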
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}

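/* Trans functions for 2-reg-misc insns that are a single gvec expander */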
#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)

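/* VMVN: bitwise NOT; only the size == 0 encoding exists */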
static bool trans_VMVN(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc_vec(s, a, tcg_gen_gvec_not);
}

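/*
 * Adapt the out-of-line crypto helpers to the GVecGen2Fn signature
 * expected by do_2misc_vec(); DATA distinguishes the encrypt and
 * decrypt variants that share a helper.
 */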
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)

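/* Crypto 2-reg-misc: UNDEF unless the feature is present and size matches */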
#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }

DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)

static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
    int pass;

    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (!fn) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
        fn(tmp, tmp);
        neon_store_reg(a->vd, pass, tmp);
    }

    return true;
}

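/* VREV32: reverse the order of the elements within each 32-bit word */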
static bool trans_VREV32(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        tcg_gen_bswap32_i32,
        gen_swap_half,
        NULL,
        NULL,
    };
    return do_2misc(s, a, fn[a->size]);
}

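/* VREV16: byte-swap within each halfword; only size == 0 is valid */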
static bool trans_VREV16(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc(s, a, gen_rev16);
}

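/* VCLS: count the leading sign bits of each element */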
static bool trans_VCLS(DisasContext *s, arg_2misc *a)
{
    static NeonGenOneOpFn * const fn[] = {
        gen_helper_neon_cls_s8,
        gen_helper_neon_cls_s16,
        gen_helper_neon_cls_s32,
        NULL,