target/arm: Implement fp16 for Neon VCVT with rounding modes
[qemu.git] / target / arm / translate-neon.c.inc
1 /*
2  *  ARM translation: AArch32 Neon instructions
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *  Copyright (c) 2020 Linaro, Ltd.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21  */
22
23 /*
24  * This file is intended to be included from translate.c; it uses
25  * some macros and definitions provided by that file.
26  * It might be possible to convert it to a standalone .c file eventually.
27  */
28
29 static inline int plus1(DisasContext *s, int x)
30 {
31     return x + 1;
32 }
33
34 static inline int rsub_64(DisasContext *s, int x)
35 {
36     return 64 - x;
37 }
38
39 static inline int rsub_32(DisasContext *s, int x)
40 {
41     return 32 - x;
42 }
43 static inline int rsub_16(DisasContext *s, int x)
44 {
45     return 16 - x;
46 }
47 static inline int rsub_8(DisasContext *s, int x)
48 {
49     return 8 - x;
50 }
51
52 /* Include the generated Neon decoder */
53 #include "decode-neon-dp.c.inc"
54 #include "decode-neon-ls.c.inc"
55 #include "decode-neon-shared.c.inc"
56
57 /* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
58  * where 0 is the least significant end of the register.
59  */
60 static inline long
61 neon_element_offset(int reg, int element, MemOp size)
62 {
63     int element_size = 1 << size;
64     int ofs = element * element_size;
65 #ifdef HOST_WORDS_BIGENDIAN
66     /* Calculate the offset assuming fully little-endian,
67      * then XOR to account for the order of the 8-byte units.
68      */
69     if (element_size < 8) {
70         ofs ^= 8 - element_size;
71     }
72 #endif
73     return neon_reg_offset(reg, 0) + ofs;
74 }
75
76 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
77 {
78     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
79
80     switch (mop) {
81     case MO_UB:
82         tcg_gen_ld8u_i32(var, cpu_env, offset);
83         break;
84     case MO_UW:
85         tcg_gen_ld16u_i32(var, cpu_env, offset);
86         break;
87     case MO_UL:
88         tcg_gen_ld_i32(var, cpu_env, offset);
89         break;
90     default:
91         g_assert_not_reached();
92     }
93 }
94
95 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
96 {
97     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
98
99     switch (mop) {
100     case MO_UB:
101         tcg_gen_ld8u_i64(var, cpu_env, offset);
102         break;
103     case MO_UW:
104         tcg_gen_ld16u_i64(var, cpu_env, offset);
105         break;
106     case MO_UL:
107         tcg_gen_ld32u_i64(var, cpu_env, offset);
108         break;
109     case MO_Q:
110         tcg_gen_ld_i64(var, cpu_env, offset);
111         break;
112     default:
113         g_assert_not_reached();
114     }
115 }
116
117 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
118 {
119     long offset = neon_element_offset(reg, ele, size);
120
121     switch (size) {
122     case MO_8:
123         tcg_gen_st8_i32(var, cpu_env, offset);
124         break;
125     case MO_16:
126         tcg_gen_st16_i32(var, cpu_env, offset);
127         break;
128     case MO_32:
129         tcg_gen_st_i32(var, cpu_env, offset);
130         break;
131     default:
132         g_assert_not_reached();
133     }
134 }
135
136 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
137 {
138     long offset = neon_element_offset(reg, ele, size);
139
140     switch (size) {
141     case MO_8:
142         tcg_gen_st8_i64(var, cpu_env, offset);
143         break;
144     case MO_16:
145         tcg_gen_st16_i64(var, cpu_env, offset);
146         break;
147     case MO_32:
148         tcg_gen_st32_i64(var, cpu_env, offset);
149         break;
150     case MO_64:
151         tcg_gen_st_i64(var, cpu_env, offset);
152         break;
153     default:
154         g_assert_not_reached();
155     }
156 }
157
158 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
159 {
160     int opr_sz;
161     TCGv_ptr fpst;
162     gen_helper_gvec_3_ptr *fn_gvec_ptr;
163
164     if (!dc_isar_feature(aa32_vcma, s)
165         || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
166         return false;
167     }
168
169     /* UNDEF accesses to D16-D31 if they don't exist. */
170     if (!dc_isar_feature(aa32_simd_r32, s) &&
171         ((a->vd | a->vn | a->vm) & 0x10)) {
172         return false;
173     }
174
175     if ((a->vn | a->vm | a->vd) & a->q) {
176         return false;
177     }
178
179     if (!vfp_access_check(s)) {
180         return true;
181     }
182
183     opr_sz = (1 + a->q) * 8;
184     fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
185     fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
186     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
187                        vfp_reg_offset(1, a->vn),
188                        vfp_reg_offset(1, a->vm),
189                        fpst, opr_sz, opr_sz, a->rot,
190                        fn_gvec_ptr);
191     tcg_temp_free_ptr(fpst);
192     return true;
193 }
194
195 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
196 {
197     int opr_sz;
198     TCGv_ptr fpst;
199     gen_helper_gvec_3_ptr *fn_gvec_ptr;
200
201     if (!dc_isar_feature(aa32_vcma, s)
202         || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
203         return false;
204     }
205
206     /* UNDEF accesses to D16-D31 if they don't exist. */
207     if (!dc_isar_feature(aa32_simd_r32, s) &&
208         ((a->vd | a->vn | a->vm) & 0x10)) {
209         return false;
210     }
211
212     if ((a->vn | a->vm | a->vd) & a->q) {
213         return false;
214     }
215
216     if (!vfp_access_check(s)) {
217         return true;
218     }
219
220     opr_sz = (1 + a->q) * 8;
221     fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
222     fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
223     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
224                        vfp_reg_offset(1, a->vn),
225                        vfp_reg_offset(1, a->vm),
226                        fpst, opr_sz, opr_sz, a->rot,
227                        fn_gvec_ptr);
228     tcg_temp_free_ptr(fpst);
229     return true;
230 }
231
232 static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
233 {
234     int opr_sz;
235     gen_helper_gvec_3 *fn_gvec;
236
237     if (!dc_isar_feature(aa32_dp, s)) {
238         return false;
239     }
240
241     /* UNDEF accesses to D16-D31 if they don't exist. */
242     if (!dc_isar_feature(aa32_simd_r32, s) &&
243         ((a->vd | a->vn | a->vm) & 0x10)) {
244         return false;
245     }
246
247     if ((a->vn | a->vm | a->vd) & a->q) {
248         return false;
249     }
250
251     if (!vfp_access_check(s)) {
252         return true;
253     }
254
255     opr_sz = (1 + a->q) * 8;
256     fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
257     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
258                        vfp_reg_offset(1, a->vn),
259                        vfp_reg_offset(1, a->vm),
260                        opr_sz, opr_sz, 0, fn_gvec);
261     return true;
262 }
263
264 static bool trans_VFML(DisasContext *s, arg_VFML *a)
265 {
266     int opr_sz;
267
268     if (!dc_isar_feature(aa32_fhm, s)) {
269         return false;
270     }
271
272     /* UNDEF accesses to D16-D31 if they don't exist. */
273     if (!dc_isar_feature(aa32_simd_r32, s) &&
274         (a->vd & 0x10)) {
275         return false;
276     }
277
278     if (a->vd & a->q) {
279         return false;
280     }
281
282     if (!vfp_access_check(s)) {
283         return true;
284     }
285
286     opr_sz = (1 + a->q) * 8;
287     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
288                        vfp_reg_offset(a->q, a->vn),
289                        vfp_reg_offset(a->q, a->vm),
290                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
291                        gen_helper_gvec_fmlal_a32);
292     return true;
293 }
294
295 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
296 {
297     gen_helper_gvec_3_ptr *fn_gvec_ptr;
298     int opr_sz;
299     TCGv_ptr fpst;
300
301     if (!dc_isar_feature(aa32_vcma, s)) {
302         return false;
303     }
304     if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
305         return false;
306     }
307
308     /* UNDEF accesses to D16-D31 if they don't exist. */
309     if (!dc_isar_feature(aa32_simd_r32, s) &&
310         ((a->vd | a->vn | a->vm) & 0x10)) {
311         return false;
312     }
313
314     if ((a->vd | a->vn) & a->q) {
315         return false;
316     }
317
318     if (!vfp_access_check(s)) {
319         return true;
320     }
321
322     fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
323                    : gen_helper_gvec_fcmlah_idx);
324     opr_sz = (1 + a->q) * 8;
325     fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
326     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
327                        vfp_reg_offset(1, a->vn),
328                        vfp_reg_offset(1, a->vm),
329                        fpst, opr_sz, opr_sz,
330                        (a->index << 2) | a->rot, fn_gvec_ptr);
331     tcg_temp_free_ptr(fpst);
332     return true;
333 }
334
335 static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
336 {
337     gen_helper_gvec_3 *fn_gvec;
338     int opr_sz;
339     TCGv_ptr fpst;
340
341     if (!dc_isar_feature(aa32_dp, s)) {
342         return false;
343     }
344
345     /* UNDEF accesses to D16-D31 if they don't exist. */
346     if (!dc_isar_feature(aa32_simd_r32, s) &&
347         ((a->vd | a->vn) & 0x10)) {
348         return false;
349     }
350
351     if ((a->vd | a->vn) & a->q) {
352         return false;
353     }
354
355     if (!vfp_access_check(s)) {
356         return true;
357     }
358
359     fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
360     opr_sz = (1 + a->q) * 8;
361     fpst = fpstatus_ptr(FPST_STD);
362     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
363                        vfp_reg_offset(1, a->vn),
364                        vfp_reg_offset(1, a->rm),
365                        opr_sz, opr_sz, a->index, fn_gvec);
366     tcg_temp_free_ptr(fpst);
367     return true;
368 }
369
370 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
371 {
372     int opr_sz;
373
374     if (!dc_isar_feature(aa32_fhm, s)) {
375         return false;
376     }
377
378     /* UNDEF accesses to D16-D31 if they don't exist. */
379     if (!dc_isar_feature(aa32_simd_r32, s) &&
380         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
381         return false;
382     }
383
384     if (a->vd & a->q) {
385         return false;
386     }
387
388     if (!vfp_access_check(s)) {
389         return true;
390     }
391
392     opr_sz = (1 + a->q) * 8;
393     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
394                        vfp_reg_offset(a->q, a->vn),
395                        vfp_reg_offset(a->q, a->rm),
396                        cpu_env, opr_sz, opr_sz,
397                        (a->index << 2) | a->s, /* is_2 == 0 */
398                        gen_helper_gvec_fmlal_idx_a32);
399     return true;
400 }
401
/*
 * Register layout for each "load/store multiple structures" type.
 * trans_VLDST_multiple() indexes this by the insn's itype field and
 * rejects itype values above 10, so there are exactly 11 entries.
 */
static const struct {
    int nregs;
    int interleave;
    int spacing;
} neon_ls_element_type[11] = {
    [0]  = { .nregs = 1, .interleave = 4, .spacing = 1 },
    [1]  = { .nregs = 1, .interleave = 4, .spacing = 2 },
    [2]  = { .nregs = 4, .interleave = 1, .spacing = 1 },
    [3]  = { .nregs = 2, .interleave = 2, .spacing = 2 },
    [4]  = { .nregs = 1, .interleave = 3, .spacing = 1 },
    [5]  = { .nregs = 1, .interleave = 3, .spacing = 2 },
    [6]  = { .nregs = 3, .interleave = 1, .spacing = 1 },
    [7]  = { .nregs = 1, .interleave = 1, .spacing = 1 },
    [8]  = { .nregs = 1, .interleave = 2, .spacing = 1 },
    [9]  = { .nregs = 1, .interleave = 2, .spacing = 2 },
    [10] = { .nregs = 2, .interleave = 1, .spacing = 1 },
};
419
420 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
421                                       int stride)
422 {
423     if (rm != 15) {
424         TCGv_i32 base;
425
426         base = load_reg(s, rn);
427         if (rm == 13) {
428             tcg_gen_addi_i32(base, base, stride);
429         } else {
430             TCGv_i32 index;
431             index = load_reg(s, rm);
432             tcg_gen_add_i32(base, base, index);
433             tcg_temp_free_i32(index);
434         }
435         store_reg(s, rn, base);
436     }
437 }
438
439 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
440 {
441     /* Neon load/store multiple structures */
442     int nregs, interleave, spacing, reg, n;
443     MemOp endian = s->be_data;
444     int mmu_idx = get_mem_index(s);
445     int size = a->size;
446     TCGv_i64 tmp64;
447     TCGv_i32 addr, tmp;
448
449     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
450         return false;
451     }
452
453     /* UNDEF accesses to D16-D31 if they don't exist */
454     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
455         return false;
456     }
457     if (a->itype > 10) {
458         return false;
459     }
460     /* Catch UNDEF cases for bad values of align field */
461     switch (a->itype & 0xc) {
462     case 4:
463         if (a->align >= 2) {
464             return false;
465         }
466         break;
467     case 8:
468         if (a->align == 3) {
469             return false;
470         }
471         break;
472     default:
473         break;
474     }
475     nregs = neon_ls_element_type[a->itype].nregs;
476     interleave = neon_ls_element_type[a->itype].interleave;
477     spacing = neon_ls_element_type[a->itype].spacing;
478     if (size == 3 && (interleave | spacing) != 1) {
479         return false;
480     }
481
482     if (!vfp_access_check(s)) {
483         return true;
484     }
485
486     /* For our purposes, bytes are always little-endian.  */
487     if (size == 0) {
488         endian = MO_LE;
489     }
490     /*
491      * Consecutive little-endian elements from a single register
492      * can be promoted to a larger little-endian operation.
493      */
494     if (interleave == 1 && endian == MO_LE) {
495         size = 3;
496     }
497     tmp64 = tcg_temp_new_i64();
498     addr = tcg_temp_new_i32();
499     tmp = tcg_const_i32(1 << size);
500     load_reg_var(s, addr, a->rn);
501     for (reg = 0; reg < nregs; reg++) {
502         for (n = 0; n < 8 >> size; n++) {
503             int xs;
504             for (xs = 0; xs < interleave; xs++) {
505                 int tt = a->vd + reg + spacing * xs;
506
507                 if (a->l) {
508                     gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
509                     neon_store_element64(tt, n, size, tmp64);
510                 } else {
511                     neon_load_element64(tmp64, tt, n, size);
512                     gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
513                 }
514                 tcg_gen_add_i32(addr, addr, tmp);
515             }
516         }
517     }
518     tcg_temp_free_i32(addr);
519     tcg_temp_free_i32(tmp);
520     tcg_temp_free_i64(tmp64);
521
522     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
523     return true;
524 }
525
526 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
527 {
528     /* Neon load single structure to all lanes */
529     int reg, stride, vec_size;
530     int vd = a->vd;
531     int size = a->size;
532     int nregs = a->n + 1;
533     TCGv_i32 addr, tmp;
534
535     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
536         return false;
537     }
538
539     /* UNDEF accesses to D16-D31 if they don't exist */
540     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
541         return false;
542     }
543
544     if (size == 3) {
545         if (nregs != 4 || a->a == 0) {
546             return false;
547         }
548         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
549         size = 2;
550     }
551     if (nregs == 1 && a->a == 1 && size == 0) {
552         return false;
553     }
554     if (nregs == 3 && a->a == 1) {
555         return false;
556     }
557
558     if (!vfp_access_check(s)) {
559         return true;
560     }
561
562     /*
563      * VLD1 to all lanes: T bit indicates how many Dregs to write.
564      * VLD2/3/4 to all lanes: T bit indicates register stride.
565      */
566     stride = a->t ? 2 : 1;
567     vec_size = nregs == 1 ? stride * 8 : 8;
568
569     tmp = tcg_temp_new_i32();
570     addr = tcg_temp_new_i32();
571     load_reg_var(s, addr, a->rn);
572     for (reg = 0; reg < nregs; reg++) {
573         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
574                         s->be_data | size);
575         if ((vd & 1) && vec_size == 16) {
576             /*
577              * We cannot write 16 bytes at once because the
578              * destination is unaligned.
579              */
580             tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
581                                  8, 8, tmp);
582             tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
583                              neon_reg_offset(vd, 0), 8, 8);
584         } else {
585             tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
586                                  vec_size, vec_size, tmp);
587         }
588         tcg_gen_addi_i32(addr, addr, 1 << size);
589         vd += stride;
590     }
591     tcg_temp_free_i32(tmp);
592     tcg_temp_free_i32(addr);
593
594     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
595
596     return true;
597 }
598
599 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
600 {
601     /* Neon load/store single structure to one lane */
602     int reg;
603     int nregs = a->n + 1;
604     int vd = a->vd;
605     TCGv_i32 addr, tmp;
606
607     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
608         return false;
609     }
610
611     /* UNDEF accesses to D16-D31 if they don't exist */
612     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
613         return false;
614     }
615
616     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
617     switch (nregs) {
618     case 1:
619         if (((a->align & (1 << a->size)) != 0) ||
620             (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
621             return false;
622         }
623         break;
624     case 3:
625         if ((a->align & 1) != 0) {
626             return false;
627         }
628         /* fall through */
629     case 2:
630         if (a->size == 2 && (a->align & 2) != 0) {
631             return false;
632         }
633         break;
634     case 4:
635         if ((a->size == 2) && ((a->align & 3) == 3)) {
636             return false;
637         }
638         break;
639     default:
640         abort();
641     }
642     if ((vd + a->stride * (nregs - 1)) > 31) {
643         /*
644          * Attempts to write off the end of the register file are
645          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
646          * access off the end of the array that holds the register data.
647          */
648         return false;
649     }
650
651     if (!vfp_access_check(s)) {
652         return true;
653     }
654
655     tmp = tcg_temp_new_i32();
656     addr = tcg_temp_new_i32();
657     load_reg_var(s, addr, a->rn);
658     /*
659      * TODO: if we implemented alignment exceptions, we should check
660      * addr against the alignment encoded in a->align here.
661      */
662     for (reg = 0; reg < nregs; reg++) {
663         if (a->l) {
664             gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
665                             s->be_data | a->size);
666             neon_store_element(vd, a->reg_idx, a->size, tmp);
667         } else { /* Store */
668             neon_load_element(tmp, vd, a->reg_idx, a->size);
669             gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
670                             s->be_data | a->size);
671         }
672         vd += a->stride;
673         tcg_gen_addi_i32(addr, addr, 1 << a->size);
674     }
675     tcg_temp_free_i32(addr);
676     tcg_temp_free_i32(tmp);
677
678     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
679
680     return true;
681 }
682
683 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
684 {
685     int vec_size = a->q ? 16 : 8;
686     int rd_ofs = neon_reg_offset(a->vd, 0);
687     int rn_ofs = neon_reg_offset(a->vn, 0);
688     int rm_ofs = neon_reg_offset(a->vm, 0);
689
690     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
691         return false;
692     }
693
694     /* UNDEF accesses to D16-D31 if they don't exist. */
695     if (!dc_isar_feature(aa32_simd_r32, s) &&
696         ((a->vd | a->vn | a->vm) & 0x10)) {
697         return false;
698     }
699
700     if ((a->vn | a->vm | a->vd) & a->q) {
701         return false;
702     }
703
704     if (!vfp_access_check(s)) {
705         return true;
706     }
707
708     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
709     return true;
710 }
711
712 #define DO_3SAME(INSN, FUNC)                                            \
713     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
714     {                                                                   \
715         return do_3same(s, a, FUNC);                                    \
716     }
717
718 DO_3SAME(VADD, tcg_gen_gvec_add)
719 DO_3SAME(VSUB, tcg_gen_gvec_sub)
720 DO_3SAME(VAND, tcg_gen_gvec_and)
721 DO_3SAME(VBIC, tcg_gen_gvec_andc)
722 DO_3SAME(VORR, tcg_gen_gvec_or)
723 DO_3SAME(VORN, tcg_gen_gvec_orc)
724 DO_3SAME(VEOR, tcg_gen_gvec_xor)
725 DO_3SAME(VSHL_S, gen_gvec_sshl)
726 DO_3SAME(VSHL_U, gen_gvec_ushl)
727 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
728 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
729 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
730 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
731
732 /* These insns are all gvec_bitsel but with the inputs in various orders. */
733 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
734     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
735                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
736                                 uint32_t oprsz, uint32_t maxsz)         \
737     {                                                                   \
738         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
739     }                                                                   \
740     DO_3SAME(INSN, gen_##INSN##_3s)
741
742 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
743 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
744 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
745
746 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
747     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
748     {                                                                   \
749         if (a->size == 3) {                                             \
750             return false;                                               \
751         }                                                               \
752         return do_3same(s, a, FUNC);                                    \
753     }
754
755 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
756 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
757 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
758 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
759 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
760 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
761 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
762 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
763 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
764 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
765 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
766 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
767
768 #define DO_3SAME_CMP(INSN, COND)                                        \
769     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
770                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
771                                 uint32_t oprsz, uint32_t maxsz)         \
772     {                                                                   \
773         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
774     }                                                                   \
775     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
776
777 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
778 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
779 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
780 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
781 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
782
783 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
784     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
785                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
786     {                                                                      \
787         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
788     }
789
790 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
791
792 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
793 {
794     if (a->size != 0) {
795         return false;
796     }
797     return do_3same(s, a, gen_VMUL_p_3s);
798 }
799
800 #define DO_VQRDMLAH(INSN, FUNC)                                         \
801     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
802     {                                                                   \
803         if (!dc_isar_feature(aa32_rdm, s)) {                            \
804             return false;                                               \
805         }                                                               \
806         if (a->size != 1 && a->size != 2) {                             \
807             return false;                                               \
808         }                                                               \
809         return do_3same(s, a, FUNC);                                    \
810     }
811
812 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
813 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
814
815 #define DO_SHA1(NAME, FUNC)                                             \
816     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
817     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
818     {                                                                   \
819         if (!dc_isar_feature(aa32_sha1, s)) {                           \
820             return false;                                               \
821         }                                                               \
822         return do_3same(s, a, gen_##NAME##_3s);                         \
823     }
824
825 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
826 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
827 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
828 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
829
830 #define DO_SHA2(NAME, FUNC)                                             \
831     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
832     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
833     {                                                                   \
834         if (!dc_isar_feature(aa32_sha2, s)) {                           \
835             return false;                                               \
836         }                                                               \
837         return do_3same(s, a, gen_##NAME##_3s);                         \
838     }
839
840 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
841 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
842 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
843
/*
 * Define gen_<INSN>_3s() and trans_<INSN>_3s() for an insn expanded
 * through tcg_gen_gvec_3() with ELT_FN installed as the .fni8 hook of
 * a GVecGen3 descriptor.  The element size is not used.
 */
#define DO_3SAME_64(INSN, ELT_FN)                                       \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 desc = { .fni8 = ELT_FN };                \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,                          \
                       oprsz, maxsz, &desc);                            \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)
853
854 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
855     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
856     {                                                                   \
857         FUNC(d, cpu_env, n, m);                                         \
858     }                                                                   \
859     DO_3SAME_64(INSN, gen_##INSN##_elt)
860
861 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
862 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
863 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
864 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
865 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
866 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
867
/*
 * Define gen_<INSN>_3s() and trans_<INSN>_3s() for an insn implemented
 * by the helpers gen_helper_neon_<HELPER>{8,16,32}, installed as .fni4
 * hooks and selected by element size.  Slot 3 of the table is left
 * zeroed; the trans function UNDEFs for sizes above 2, so that slot is
 * never selected through it.
 */
#define DO_3SAME_32(INSN, HELPER)                                       \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 by_size[4] = {                            \
            [0] = { .fni4 = gen_helper_neon_##HELPER##8 },              \
            [1] = { .fni4 = gen_helper_neon_##HELPER##16 },             \
            [2] = { .fni4 = gen_helper_neon_##HELPER##32 },             \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,                          \
                       oprsz, maxsz, &by_size[vece]);                   \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return a->size <= 2 && do_3same(s, a, gen_##INSN##_3s);         \
    }
888
/*
 * Some helper functions need to be passed the cpu_env.  The gvec APIs
 * such as tcg_gen_gvec_3() call their per-element hooks without it, so
 * to use those helpers we generate a wrapper with a NeonGenTwoOpFn()
 * prototype that calls the NeonGenTwoOpEnvFn() helper, inserting
 * cpu_env as its second argument.
 */
#define WRAP_ENV_FN(WRAPNAME, ENV_HELPER)                               \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        ENV_HELPER(d, cpu_env, n, m);                                   \
    }
900
/*
 * Define trans_<INSN>_3s() for a 3-reg-same insn whose per-element
 * helpers gen_helper_neon_<FUNC>{8,16,32} need to be passed cpu_env.
 * A WRAP_ENV_FN trampoline is generated for each element size so
 * that the helpers can be used as the .fni4 hooks of a GVecGen3.
 * The table has no usable entry for size == 3, so that size is
 * rejected (return false) before do_3same() is called.
 *
 * WRAP_ENV_FN expands to a complete function definition, so it is
 * deliberately not followed by a semicolon: a stray ';' at file
 * scope is not valid ISO C.
 */
#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8)         \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16)       \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32)       \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
924
/*
 * 3-reg-same integer ops whose helpers are used directly as the
 * gvec .fni4 hooks (expanded via DO_3SAME_32).
 */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/*
 * 3-reg-same integer ops whose helpers must be passed cpu_env and
 * are therefore reached through WRAP_ENV_FN trampolines
 * (expanded via DO_3SAME_32_ENV).
 */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
938
939 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
940 {
941     /* Operations handled pairwise 32 bits at a time */
942     TCGv_i32 tmp, tmp2, tmp3;
943
944     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
945         return false;
946     }
947
948     /* UNDEF accesses to D16-D31 if they don't exist. */
949     if (!dc_isar_feature(aa32_simd_r32, s) &&
950         ((a->vd | a->vn | a->vm) & 0x10)) {
951         return false;
952     }
953
954     if (a->size == 3) {
955         return false;
956     }
957
958     if (!vfp_access_check(s)) {
959         return true;
960     }
961
962     assert(a->q == 0); /* enforced by decode patterns */
963
964     /*
965      * Note that we have to be careful not to clobber the source operands
966      * in the "vm == vd" case by storing the result of the first pass too
967      * early. Since Q is 0 there are always just two passes, so instead
968      * of a complicated loop over each pass we just unroll.
969      */
970     tmp = neon_load_reg(a->vn, 0);
971     tmp2 = neon_load_reg(a->vn, 1);
972     fn(tmp, tmp, tmp2);
973     tcg_temp_free_i32(tmp2);
974
975     tmp3 = neon_load_reg(a->vm, 0);
976     tmp2 = neon_load_reg(a->vm, 1);
977     fn(tmp3, tmp3, tmp2);
978     tcg_temp_free_i32(tmp2);
979
980     neon_store_reg(a->vd, 0, tmp);
981     neon_store_reg(a->vd, 1, tmp3);
982     return true;
983 }
984
985 #define DO_3SAME_PAIR(INSN, func)                                       \
986     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
987     {                                                                   \
988         static NeonGenTwoOpFn * const fns[] = {                         \
989             gen_helper_neon_##func##8,                                  \
990             gen_helper_neon_##func##16,                                 \
991             gen_helper_neon_##func##32,                                 \
992         };                                                              \
993         if (a->size > 2) {                                              \
994             return false;                                               \
995         }                                                               \
996         return do_3same_pair(s, a, fns[a->size]);                       \
997     }
998
999 /* 32-bit pairwise ops end up the same as the elementwise versions.  */
1000 #define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
1001 #define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
1002 #define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
1003 #define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
1004 #define gen_helper_neon_padd_u32  tcg_gen_add_i32
1005
1006 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1007 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1008 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1009 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1010 DO_3SAME_PAIR(VPADD, padd_u)
1011
1012 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1013     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1014     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1015     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1016                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1017                                 uint32_t oprsz, uint32_t maxsz)         \
1018     {                                                                   \
1019         static const GVecGen3 ops[2] = {                                \
1020             { .fni4 = gen_##INSN##_tramp16 },                           \
1021             { .fni4 = gen_##INSN##_tramp32 },                           \
1022         };                                                              \
1023         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1024     }                                                                   \
1025     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1026     {                                                                   \
1027         if (a->size != 1 && a->size != 2) {                             \
1028             return false;                                               \
1029         }                                                               \
1030         return do_3same(s, a, gen_##INSN##_3s);                         \
1031     }
1032
1033 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1034 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1035
/*
 * Define WRAPNAME, a gvec expander taking (vece, three register
 * offsets, oprsz, maxsz), which runs the out-of-line helper FUNC via
 * tcg_gen_gvec_3_ptr() with a pointer to the float status selected
 * by FPST. The vece argument is not used, and the helper's data
 * argument is always 0.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr status = fpstatus_ptr(FPST);                           \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, status,              \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(status);                                      \
    }
1046
1047 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1048     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1049     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1050     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1051     {                                                                   \
1052         if (a->size != 0) {                                             \
1053             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1054                 return false;                                           \
1055             }                                                           \
1056             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1057         }                                                               \
1058         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1059     }
1060
1061
1062 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1063 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1064 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1065 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1066 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1067 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1068 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1069 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1070 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1071 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1072 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1073 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1074 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1075 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1076 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1077 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1078 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1079
/*
 * gvec expanders for VMAXNM and VMINNM. These are instantiated by
 * hand rather than through DO_3S_FP_GVEC because their trans
 * functions carry an extra ARM_FEATURE_V8 check.
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1084
1085 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1086 {
1087     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1088         return false;
1089     }
1090
1091     if (a->size != 0) {
1092         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1093             return false;
1094         }
1095         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1096     }
1097     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1098 }
1099
1100 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1101 {
1102     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1103         return false;
1104     }
1105
1106     if (a->size != 0) {
1107         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1108             return false;
1109         }
1110         return do_3same(s, a, gen_VMINNM_fp16_3s);
1111     }
1112     return do_3same(s, a, gen_VMINNM_fp32_3s);
1113 }
1114
1115 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1116                              gen_helper_gvec_3_ptr *fn)
1117 {
1118     /* FP pairwise operations */
1119     TCGv_ptr fpstatus;
1120
1121     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1122         return false;
1123     }
1124
1125     /* UNDEF accesses to D16-D31 if they don't exist. */
1126     if (!dc_isar_feature(aa32_simd_r32, s) &&
1127         ((a->vd | a->vn | a->vm) & 0x10)) {
1128         return false;
1129     }
1130
1131     if (!vfp_access_check(s)) {
1132         return true;
1133     }
1134
1135     assert(a->q == 0); /* enforced by decode patterns */
1136
1137
1138     fpstatus = fpstatus_ptr(a->size != 0 ? FPST_STD_F16 : FPST_STD);
1139     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1140                        vfp_reg_offset(1, a->vn),
1141                        vfp_reg_offset(1, a->vm),
1142                        fpstatus, 8, 8, 0, fn);
1143     tcg_temp_free_ptr(fpstatus);
1144
1145     return true;
1146 }
1147
1148 /*
1149  * For all the functions using this macro, size == 1 means fp16,
1150  * which is an architecture extension we don't implement yet.
1151  */
1152 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1153     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1154     {                                                               \
1155         if (a->size != 0) {                                         \
1156             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1157                 return false;                                       \
1158             }                                                       \
1159             return do_3same_fp_pair(s, a, FUNC##h);                 \
1160         }                                                           \
1161         return do_3same_fp_pair(s, a, FUNC##s);                     \
1162     }
1163
1164 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1165 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1166 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1167
1168 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1169 {
1170     /* Handle a 2-reg-shift insn which can be vectorized. */
1171     int vec_size = a->q ? 16 : 8;
1172     int rd_ofs = neon_reg_offset(a->vd, 0);
1173     int rm_ofs = neon_reg_offset(a->vm, 0);
1174
1175     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1176         return false;
1177     }
1178
1179     /* UNDEF accesses to D16-D31 if they don't exist. */
1180     if (!dc_isar_feature(aa32_simd_r32, s) &&
1181         ((a->vd | a->vm) & 0x10)) {
1182         return false;
1183     }
1184
1185     if ((a->vm | a->vd) & a->q) {
1186         return false;
1187     }
1188
1189     if (!vfp_access_check(s)) {
1190         return true;
1191     }
1192
1193     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1194     return true;
1195 }
1196
1197 #define DO_2SH(INSN, FUNC)                                              \
1198     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1199     {                                                                   \
1200         return do_vector_2sh(s, a, FUNC);                               \
1201     }                                                                   \
1202
1203 DO_2SH(VSHL, tcg_gen_gvec_shli)
1204 DO_2SH(VSLI, gen_gvec_sli)
1205 DO_2SH(VSRI, gen_gvec_sri)
1206 DO_2SH(VSRA_S, gen_gvec_ssra)
1207 DO_2SH(VSRA_U, gen_gvec_usra)
1208 DO_2SH(VRSHR_S, gen_gvec_srshr)
1209 DO_2SH(VRSHR_U, gen_gvec_urshr)
1210 DO_2SH(VRSRA_S, gen_gvec_srsra)
1211 DO_2SH(VRSRA_U, gen_gvec_ursra)
1212
1213 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1214 {
1215     /* Signed shift out of range results in all-sign-bits */
1216     a->shift = MIN(a->shift, (8 << a->size) - 1);
1217     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1218 }
1219
/*
 * Expander with the GVecGen2iFn argument list which ignores the
 * source offset and the shift count and simply fills the
 * destination vector with zeroes.
 */
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}
1225
1226 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1227 {
1228     /* Shift out of range is architecturally valid and results in zero. */
1229     if (a->shift >= (8 << a->size)) {
1230         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1231     } else {
1232         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1233     }
1234 }
1235
1236 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1237                              NeonGenTwo64OpEnvFn *fn)
1238 {
1239     /*
1240      * 2-reg-and-shift operations, size == 3 case, where the
1241      * function needs to be passed cpu_env.
1242      */
1243     TCGv_i64 constimm;
1244     int pass;
1245
1246     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1247         return false;
1248     }
1249
1250     /* UNDEF accesses to D16-D31 if they don't exist. */
1251     if (!dc_isar_feature(aa32_simd_r32, s) &&
1252         ((a->vd | a->vm) & 0x10)) {
1253         return false;
1254     }
1255
1256     if ((a->vm | a->vd) & a->q) {
1257         return false;
1258     }
1259
1260     if (!vfp_access_check(s)) {
1261         return true;
1262     }
1263
1264     /*
1265      * To avoid excessive duplication of ops we implement shift
1266      * by immediate using the variable shift operations.
1267      */
1268     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1269
1270     for (pass = 0; pass < a->q + 1; pass++) {
1271         TCGv_i64 tmp = tcg_temp_new_i64();
1272
1273         neon_load_reg64(tmp, a->vm + pass);
1274         fn(tmp, cpu_env, tmp, constimm);
1275         neon_store_reg64(tmp, a->vd + pass);
1276         tcg_temp_free_i64(tmp);
1277     }
1278     tcg_temp_free_i64(constimm);
1279     return true;
1280 }
1281
1282 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1283                              NeonGenTwoOpEnvFn *fn)
1284 {
1285     /*
1286      * 2-reg-and-shift operations, size < 3 case, where the
1287      * helper needs to be passed cpu_env.
1288      */
1289     TCGv_i32 constimm;
1290     int pass;
1291
1292     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1293         return false;
1294     }
1295
1296     /* UNDEF accesses to D16-D31 if they don't exist. */
1297     if (!dc_isar_feature(aa32_simd_r32, s) &&
1298         ((a->vd | a->vm) & 0x10)) {
1299         return false;
1300     }
1301
1302     if ((a->vm | a->vd) & a->q) {
1303         return false;
1304     }
1305
1306     if (!vfp_access_check(s)) {
1307         return true;
1308     }
1309
1310     /*
1311      * To avoid excessive duplication of ops we implement shift
1312      * by immediate using the variable shift operations.
1313      */
1314     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1315
1316     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1317         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
1318         fn(tmp, cpu_env, tmp, constimm);
1319         neon_store_reg(a->vd, pass, tmp);
1320     }
1321     tcg_temp_free_i32(constimm);
1322     return true;
1323 }
1324
1325 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1326     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1327     {                                                                   \
1328         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1329     }                                                                   \
1330     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1331     {                                                                   \
1332         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1333             gen_helper_neon_##FUNC##8,                                  \
1334             gen_helper_neon_##FUNC##16,                                 \
1335             gen_helper_neon_##FUNC##32,                                 \
1336         };                                                              \
1337         assert(a->size < ARRAY_SIZE(fns));                              \
1338         return do_2shift_env_32(s, a, fns[a->size]);                    \
1339     }
1340
1341 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1342 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1343 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1344
1345 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1346                                 NeonGenTwo64OpFn *shiftfn,
1347                                 NeonGenNarrowEnvFn *narrowfn)
1348 {
1349     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1350     TCGv_i64 constimm, rm1, rm2;
1351     TCGv_i32 rd;
1352
1353     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1354         return false;
1355     }
1356
1357     /* UNDEF accesses to D16-D31 if they don't exist. */
1358     if (!dc_isar_feature(aa32_simd_r32, s) &&
1359         ((a->vd | a->vm) & 0x10)) {
1360         return false;
1361     }
1362
1363     if (a->vm & 1) {
1364         return false;
1365     }
1366
1367     if (!vfp_access_check(s)) {
1368         return true;
1369     }
1370
1371     /*
1372      * This is always a right shift, and the shiftfn is always a
1373      * left-shift helper, which thus needs the negated shift count.
1374      */
1375     constimm = tcg_const_i64(-a->shift);
1376     rm1 = tcg_temp_new_i64();
1377     rm2 = tcg_temp_new_i64();
1378
1379     /* Load both inputs first to avoid potential overwrite if rm == rd */
1380     neon_load_reg64(rm1, a->vm);
1381     neon_load_reg64(rm2, a->vm + 1);
1382
1383     shiftfn(rm1, rm1, constimm);
1384     rd = tcg_temp_new_i32();
1385     narrowfn(rd, cpu_env, rm1);
1386     neon_store_reg(a->vd, 0, rd);
1387
1388     shiftfn(rm2, rm2, constimm);
1389     rd = tcg_temp_new_i32();
1390     narrowfn(rd, cpu_env, rm2);
1391     neon_store_reg(a->vd, 1, rd);
1392
1393     tcg_temp_free_i64(rm1);
1394     tcg_temp_free_i64(rm2);
1395     tcg_temp_free_i64(constimm);
1396
1397     return true;
1398 }
1399
1400 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1401                                 NeonGenTwoOpFn *shiftfn,
1402                                 NeonGenNarrowEnvFn *narrowfn)
1403 {
1404     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1405     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1406     TCGv_i64 rtmp;
1407     uint32_t imm;
1408
1409     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1410         return false;
1411     }
1412
1413     /* UNDEF accesses to D16-D31 if they don't exist. */
1414     if (!dc_isar_feature(aa32_simd_r32, s) &&
1415         ((a->vd | a->vm) & 0x10)) {
1416         return false;
1417     }
1418
1419     if (a->vm & 1) {
1420         return false;
1421     }
1422
1423     if (!vfp_access_check(s)) {
1424         return true;
1425     }
1426
1427     /*
1428      * This is always a right shift, and the shiftfn is always a
1429      * left-shift helper, which thus needs the negated shift count
1430      * duplicated into each lane of the immediate value.
1431      */
1432     if (a->size == 1) {
1433         imm = (uint16_t)(-a->shift);
1434         imm |= imm << 16;
1435     } else {
1436         /* size == 2 */
1437         imm = -a->shift;
1438     }
1439     constimm = tcg_const_i32(imm);
1440
1441     /* Load all inputs first to avoid potential overwrite */
1442     rm1 = neon_load_reg(a->vm, 0);
1443     rm2 = neon_load_reg(a->vm, 1);
1444     rm3 = neon_load_reg(a->vm + 1, 0);
1445     rm4 = neon_load_reg(a->vm + 1, 1);
1446     rtmp = tcg_temp_new_i64();
1447
1448     shiftfn(rm1, rm1, constimm);
1449     shiftfn(rm2, rm2, constimm);
1450
1451     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1452     tcg_temp_free_i32(rm2);
1453
1454     narrowfn(rm1, cpu_env, rtmp);
1455     neon_store_reg(a->vd, 0, rm1);
1456
1457     shiftfn(rm3, rm3, constimm);
1458     shiftfn(rm4, rm4, constimm);
1459     tcg_temp_free_i32(constimm);
1460
1461     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1462     tcg_temp_free_i32(rm4);
1463
1464     narrowfn(rm3, cpu_env, rtmp);
1465     tcg_temp_free_i64(rtmp);
1466     neon_store_reg(a->vd, 1, rm3);
1467     return true;
1468 }
1469
/*
 * Define trans_<INSN>_2sh() for a narrowing shift insn. FUNC is the
 * shift function and NARROWFUNC the narrowing function; DO_2SN_64
 * routes them to do_2shift_narrow_64() and DO_2SN_32 to
 * do_2shift_narrow_32().
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1480
/*
 * Adapters which give the plain narrowing operations the same
 * (dest, env, src) argument list as the narrowing functions used
 * with DO_2SN_64/DO_2SN_32 below. None of them uses the env argument.
 */

/* 64 -> 32 bits: take the low half of src */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}

/* Narrowing to 16-bit elements, done by the narrow_u16 helper */
static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}

/* Narrowing to 8-bit elements, done by the narrow_u8 helper */
static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}
1495
/*
 * Narrowing shift instantiations. Within each group the three lines
 * are the 64-bit, 32-bit and 16-bit source element forms.
 */

/* Unsigned shift, plain (non-saturating) narrow */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

/* Unsigned rshl shift helpers, plain (non-saturating) narrow */
DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

/* Signed shift, unarrow_sat narrowing helpers */
DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

/* Signed rshl shift helpers, unarrow_sat narrowing helpers */
DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)

/* Signed shift, narrow_sat_s narrowing helpers */
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

/* Signed rshl shift helpers, narrow_sat_s narrowing helpers */
DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

/* Unsigned shift, narrow_sat_u narrowing helpers */
DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

/* Unsigned rshl shift helpers, narrow_sat_u narrowing helpers */
DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1526
1527 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1528                          NeonGenWidenFn *widenfn, bool u)
1529 {
1530     TCGv_i64 tmp;
1531     TCGv_i32 rm0, rm1;
1532     uint64_t widen_mask = 0;
1533
1534     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1535         return false;
1536     }
1537
1538     /* UNDEF accesses to D16-D31 if they don't exist. */
1539     if (!dc_isar_feature(aa32_simd_r32, s) &&
1540         ((a->vd | a->vm) & 0x10)) {
1541         return false;
1542     }
1543
1544     if (a->vd & 1) {
1545         return false;
1546     }
1547
1548     if (!vfp_access_check(s)) {
1549         return true;
1550     }
1551
1552     /*
1553      * This is a widen-and-shift operation. The shift is always less
1554      * than the width of the source type, so after widening the input
1555      * vector we can simply shift the whole 64-bit widened register,
1556      * and then clear the potential overflow bits resulting from left
1557      * bits of the narrow input appearing as right bits of the left
1558      * neighbour narrow input. Calculate a mask of bits to clear.
1559      */
1560     if ((a->shift != 0) && (a->size < 2 || u)) {
1561         int esize = 8 << a->size;
1562         widen_mask = MAKE_64BIT_MASK(0, esize);
1563         widen_mask >>= esize - a->shift;
1564         widen_mask = dup_const(a->size + 1, widen_mask);
1565     }
1566
1567     rm0 = neon_load_reg(a->vm, 0);
1568     rm1 = neon_load_reg(a->vm, 1);
1569     tmp = tcg_temp_new_i64();
1570
1571     widenfn(tmp, rm0);
1572     tcg_temp_free_i32(rm0);
1573     if (a->shift != 0) {
1574         tcg_gen_shli_i64(tmp, tmp, a->shift);
1575         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1576     }
1577     neon_store_reg64(tmp, a->vd);
1578
1579     widenfn(tmp, rm1);
1580     tcg_temp_free_i32(rm1);
1581     if (a->shift != 0) {
1582         tcg_gen_shli_i64(tmp, tmp, a->shift);
1583         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1584     }
1585     neon_store_reg64(tmp, a->vd + 1);
1586     tcg_temp_free_i64(tmp);
1587     return true;
1588 }
1589
1590 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1591 {
1592     static NeonGenWidenFn * const widenfn[] = {
1593         gen_helper_neon_widen_s8,
1594         gen_helper_neon_widen_s16,
1595         tcg_gen_ext_i32_i64,
1596     };
1597     return do_vshll_2sh(s, a, widenfn[a->size], false);
1598 }
1599
1600 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1601 {
1602     static NeonGenWidenFn * const widenfn[] = {
1603         gen_helper_neon_widen_u8,
1604         gen_helper_neon_widen_u16,
1605         tcg_gen_extu_i32_i64,
1606     };
1607     return do_vshll_2sh(s, a, widenfn[a->size], true);
1608 }
1609
1610 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1611                       gen_helper_gvec_2_ptr *fn)
1612 {
1613     /* FP operations in 2-reg-and-shift group */
1614     int vec_size = a->q ? 16 : 8;
1615     int rd_ofs = neon_reg_offset(a->vd, 0);
1616     int rm_ofs = neon_reg_offset(a->vm, 0);
1617     TCGv_ptr fpst;
1618
1619     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1620         return false;
1621     }
1622
1623     if (a->size != 0) {
1624         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1625             return false;
1626         }
1627     }
1628
1629     /* UNDEF accesses to D16-D31 if they don't exist. */
1630     if (!dc_isar_feature(aa32_simd_r32, s) &&
1631         ((a->vd | a->vm) & 0x10)) {
1632         return false;
1633     }
1634
1635     if ((a->vm | a->vd) & a->q) {
1636         return false;
1637     }
1638
1639     if (!vfp_access_check(s)) {
1640         return true;
1641     }
1642
1643     fpst = fpstatus_ptr(a->size ? FPST_STD_F16 : FPST_STD);
1644     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1645     tcg_temp_free_ptr(fpst);
1646     return true;
1647 }
1648
/*
 * Define trans_<INSN>_2sh() as a thin wrapper which passes the gvec
 * helper FUNC to do_fp_2sh().
 */
#define DO_FP_2SH(INSN, FUNC)                                           \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_fp_2sh(s, a, FUNC);                                   \
    }

/* VCVT forms using the _sf/_uf/_fs/_fu helpers */
DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)

/*
 * VCVT forms using the _sh/_uh/_hs/_hu helpers.
 * NOTE(review): presumably the fp16 counterparts of the group above;
 * confirm against the helper definitions.
 */
DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1664
1665 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1666 {
1667     /*
1668      * Expand the encoded constant.
1669      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1670      * We choose to not special-case this and will behave as if a
1671      * valid constant encoding of 0 had been given.
1672      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1673      */
1674     switch (cmode) {
1675     case 0: case 1:
1676         /* no-op */
1677         break;
1678     case 2: case 3:
1679         imm <<= 8;
1680         break;
1681     case 4: case 5:
1682         imm <<= 16;
1683         break;
1684     case 6: case 7:
1685         imm <<= 24;
1686         break;
1687     case 8: case 9:
1688         imm |= imm << 16;
1689         break;
1690     case 10: case 11:
1691         imm = (imm << 8) | (imm << 24);
1692         break;
1693     case 12:
1694         imm = (imm << 8) | 0xff;
1695         break;
1696     case 13:
1697         imm = (imm << 16) | 0xffff;
1698         break;
1699     case 14:
1700         if (op) {
1701             /*
1702              * This is the only case where the top and bottom 32 bits
1703              * of the encoded constant differ.
1704              */
1705             uint64_t imm64 = 0;
1706             int n;
1707
1708             for (n = 0; n < 8; n++) {
1709                 if (imm & (1 << n)) {
1710                     imm64 |= (0xffULL << (n * 8));
1711                 }
1712             }
1713             return imm64;
1714         }
1715         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1716         break;
1717     case 15:
1718         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1719             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1720         break;
1721     }
1722     if (op) {
1723         imm = ~imm;
1724     }
1725     return dup_const(MO_32, imm);
1726 }
1727
1728 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1729                         GVecGen2iFn *fn)
1730 {
1731     uint64_t imm;
1732     int reg_ofs, vec_size;
1733
1734     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1735         return false;
1736     }
1737
1738     /* UNDEF accesses to D16-D31 if they don't exist. */
1739     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1740         return false;
1741     }
1742
1743     if (a->vd & a->q) {
1744         return false;
1745     }
1746
1747     if (!vfp_access_check(s)) {
1748         return true;
1749     }
1750
1751     reg_ofs = neon_reg_offset(a->vd, 0);
1752     vec_size = a->q ? 16 : 8;
1753     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1754
1755     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1756     return true;
1757 }
1758
1759 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1760                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1761 {
1762     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1763 }
1764
1765 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1766 {
1767     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1768     GVecGen2iFn *fn;
1769
1770     if ((a->cmode & 1) && a->cmode < 12) {
1771         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1772         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1773     } else {
1774         /* There is one unallocated cmode/op combination in this space */
1775         if (a->cmode == 15 && a->op == 1) {
1776             return false;
1777         }
1778         fn = gen_VMOV_1r;
1779     }
1780     return do_1reg_imm(s, a, fn);
1781 }
1782
1783 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1784                            NeonGenWidenFn *widenfn,
1785                            NeonGenTwo64OpFn *opfn,
1786                            bool src1_wide)
1787 {
1788     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1789     TCGv_i64 rn0_64, rn1_64, rm_64;
1790     TCGv_i32 rm;
1791
1792     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1793         return false;
1794     }
1795
1796     /* UNDEF accesses to D16-D31 if they don't exist. */
1797     if (!dc_isar_feature(aa32_simd_r32, s) &&
1798         ((a->vd | a->vn | a->vm) & 0x10)) {
1799         return false;
1800     }
1801
1802     if (!widenfn || !opfn) {
1803         /* size == 3 case, which is an entirely different insn group */
1804         return false;
1805     }
1806
1807     if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
1808         return false;
1809     }
1810
1811     if (!vfp_access_check(s)) {
1812         return true;
1813     }
1814
1815     rn0_64 = tcg_temp_new_i64();
1816     rn1_64 = tcg_temp_new_i64();
1817     rm_64 = tcg_temp_new_i64();
1818
1819     if (src1_wide) {
1820         neon_load_reg64(rn0_64, a->vn);
1821     } else {
1822         TCGv_i32 tmp = neon_load_reg(a->vn, 0);
1823         widenfn(rn0_64, tmp);
1824         tcg_temp_free_i32(tmp);
1825     }
1826     rm = neon_load_reg(a->vm, 0);
1827
1828     widenfn(rm_64, rm);
1829     tcg_temp_free_i32(rm);
1830     opfn(rn0_64, rn0_64, rm_64);
1831
1832     /*
1833      * Load second pass inputs before storing the first pass result, to
1834      * avoid incorrect results if a narrow input overlaps with the result.
1835      */
1836     if (src1_wide) {
1837         neon_load_reg64(rn1_64, a->vn + 1);
1838     } else {
1839         TCGv_i32 tmp = neon_load_reg(a->vn, 1);
1840         widenfn(rn1_64, tmp);
1841         tcg_temp_free_i32(tmp);
1842     }
1843     rm = neon_load_reg(a->vm, 1);
1844
1845     neon_store_reg64(rn0_64, a->vd);
1846
1847     widenfn(rm_64, rm);
1848     tcg_temp_free_i32(rm);
1849     opfn(rn1_64, rn1_64, rm_64);
1850     neon_store_reg64(rn1_64, a->vd + 1);
1851
1852     tcg_temp_free_i64(rn0_64);
1853     tcg_temp_free_i64(rn1_64);
1854     tcg_temp_free_i64(rm_64);
1855
1856     return true;
1857 }
1858
1859 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
1860     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1861     {                                                                   \
1862         static NeonGenWidenFn * const widenfn[] = {                     \
1863             gen_helper_neon_widen_##S##8,                               \
1864             gen_helper_neon_widen_##S##16,                              \
1865             tcg_gen_##EXT##_i32_i64,                                    \
1866             NULL,                                                       \
1867         };                                                              \
1868         static NeonGenTwo64OpFn * const addfn[] = {                     \
1869             gen_helper_neon_##OP##l_u16,                                \
1870             gen_helper_neon_##OP##l_u32,                                \
1871             tcg_gen_##OP##_i64,                                         \
1872             NULL,                                                       \
1873         };                                                              \
1874         return do_prewiden_3d(s, a, widenfn[a->size],                   \
1875                               addfn[a->size], SRC1WIDE);                \
1876     }
1877
1878 DO_PREWIDEN(VADDL_S, s, ext, add, false)
1879 DO_PREWIDEN(VADDL_U, u, extu, add, false)
1880 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
1881 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
1882 DO_PREWIDEN(VADDW_S, s, ext, add, true)
1883 DO_PREWIDEN(VADDW_U, u, extu, add, true)
1884 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
1885 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
1886
1887 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1888                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1889 {
1890     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1891     TCGv_i64 rn_64, rm_64;
1892     TCGv_i32 rd0, rd1;
1893
1894     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1895         return false;
1896     }
1897
1898     /* UNDEF accesses to D16-D31 if they don't exist. */
1899     if (!dc_isar_feature(aa32_simd_r32, s) &&
1900         ((a->vd | a->vn | a->vm) & 0x10)) {
1901         return false;
1902     }
1903
1904     if (!opfn || !narrowfn) {
1905         /* size == 3 case, which is an entirely different insn group */
1906         return false;
1907     }
1908
1909     if ((a->vn | a->vm) & 1) {
1910         return false;
1911     }
1912
1913     if (!vfp_access_check(s)) {
1914         return true;
1915     }
1916
1917     rn_64 = tcg_temp_new_i64();
1918     rm_64 = tcg_temp_new_i64();
1919     rd0 = tcg_temp_new_i32();
1920     rd1 = tcg_temp_new_i32();
1921
1922     neon_load_reg64(rn_64, a->vn);
1923     neon_load_reg64(rm_64, a->vm);
1924
1925     opfn(rn_64, rn_64, rm_64);
1926
1927     narrowfn(rd0, rn_64);
1928
1929     neon_load_reg64(rn_64, a->vn + 1);
1930     neon_load_reg64(rm_64, a->vm + 1);
1931
1932     opfn(rn_64, rn_64, rm_64);
1933
1934     narrowfn(rd1, rn_64);
1935
1936     neon_store_reg(a->vd, 0, rd0);
1937     neon_store_reg(a->vd, 1, rd1);
1938
1939     tcg_temp_free_i64(rn_64);
1940     tcg_temp_free_i64(rm_64);
1941
1942     return true;
1943 }
1944
1945 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1946     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1947     {                                                                   \
1948         static NeonGenTwo64OpFn * const addfn[] = {                     \
1949             gen_helper_neon_##OP##l_u16,                                \
1950             gen_helper_neon_##OP##l_u32,                                \
1951             tcg_gen_##OP##_i64,                                         \
1952             NULL,                                                       \
1953         };                                                              \
1954         static NeonGenNarrowFn * const narrowfn[] = {                   \
1955             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1956             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1957             EXTOP,                                                      \
1958             NULL,                                                       \
1959         };                                                              \
1960         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1961     }
1962
1963 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1964 {
1965     tcg_gen_addi_i64(rn, rn, 1u << 31);
1966     tcg_gen_extrh_i64_i32(rd, rn);
1967 }
1968
1969 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1970 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1971 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1972 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1973
1974 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1975                        NeonGenTwoOpWidenFn *opfn,
1976                        NeonGenTwo64OpFn *accfn)
1977 {
1978     /*
1979      * 3-regs different lengths, long operations.
1980      * These perform an operation on two inputs that returns a double-width
1981      * result, and then possibly perform an accumulation operation of
1982      * that result into the double-width destination.
1983      */
1984     TCGv_i64 rd0, rd1, tmp;
1985     TCGv_i32 rn, rm;
1986
1987     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1988         return false;
1989     }
1990
1991     /* UNDEF accesses to D16-D31 if they don't exist. */
1992     if (!dc_isar_feature(aa32_simd_r32, s) &&
1993         ((a->vd | a->vn | a->vm) & 0x10)) {
1994         return false;
1995     }
1996
1997     if (!opfn) {
1998         /* size == 3 case, which is an entirely different insn group */
1999         return false;
2000     }
2001
2002     if (a->vd & 1) {
2003         return false;
2004     }
2005
2006     if (!vfp_access_check(s)) {
2007         return true;
2008     }
2009
2010     rd0 = tcg_temp_new_i64();
2011     rd1 = tcg_temp_new_i64();
2012
2013     rn = neon_load_reg(a->vn, 0);
2014     rm = neon_load_reg(a->vm, 0);
2015     opfn(rd0, rn, rm);
2016     tcg_temp_free_i32(rn);
2017     tcg_temp_free_i32(rm);
2018
2019     rn = neon_load_reg(a->vn, 1);
2020     rm = neon_load_reg(a->vm, 1);
2021     opfn(rd1, rn, rm);
2022     tcg_temp_free_i32(rn);
2023     tcg_temp_free_i32(rm);
2024
2025     /* Don't store results until after all loads: they might overlap */
2026     if (accfn) {
2027         tmp = tcg_temp_new_i64();
2028         neon_load_reg64(tmp, a->vd);
2029         accfn(tmp, tmp, rd0);
2030         neon_store_reg64(tmp, a->vd);
2031         neon_load_reg64(tmp, a->vd + 1);
2032         accfn(tmp, tmp, rd1);
2033         neon_store_reg64(tmp, a->vd + 1);
2034         tcg_temp_free_i64(tmp);
2035     } else {
2036         neon_store_reg64(rd0, a->vd);
2037         neon_store_reg64(rd1, a->vd + 1);
2038     }
2039
2040     tcg_temp_free_i64(rd0);
2041     tcg_temp_free_i64(rd1);
2042
2043     return true;
2044 }
2045
2046 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2047 {
2048     static NeonGenTwoOpWidenFn * const opfn[] = {
2049         gen_helper_neon_abdl_s16,
2050         gen_helper_neon_abdl_s32,
2051         gen_helper_neon_abdl_s64,
2052         NULL,
2053     };
2054
2055     return do_long_3d(s, a, opfn[a->size], NULL);
2056 }
2057
2058 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2059 {
2060     static NeonGenTwoOpWidenFn * const opfn[] = {
2061         gen_helper_neon_abdl_u16,
2062         gen_helper_neon_abdl_u32,
2063         gen_helper_neon_abdl_u64,
2064         NULL,
2065     };
2066
2067     return do_long_3d(s, a, opfn[a->size], NULL);
2068 }
2069
2070 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2071 {
2072     static NeonGenTwoOpWidenFn * const opfn[] = {
2073         gen_helper_neon_abdl_s16,
2074         gen_helper_neon_abdl_s32,
2075         gen_helper_neon_abdl_s64,
2076         NULL,
2077     };
2078     static NeonGenTwo64OpFn * const addfn[] = {
2079         gen_helper_neon_addl_u16,
2080         gen_helper_neon_addl_u32,
2081         tcg_gen_add_i64,
2082         NULL,
2083     };
2084
2085     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2086 }
2087
2088 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2089 {
2090     static NeonGenTwoOpWidenFn * const opfn[] = {
2091         gen_helper_neon_abdl_u16,
2092         gen_helper_neon_abdl_u32,
2093         gen_helper_neon_abdl_u64,
2094         NULL,
2095     };
2096     static NeonGenTwo64OpFn * const addfn[] = {
2097         gen_helper_neon_addl_u16,
2098         gen_helper_neon_addl_u32,
2099         tcg_gen_add_i64,
2100         NULL,
2101     };
2102
2103     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2104 }
2105
2106 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2107 {
2108     TCGv_i32 lo = tcg_temp_new_i32();
2109     TCGv_i32 hi = tcg_temp_new_i32();
2110
2111     tcg_gen_muls2_i32(lo, hi, rn, rm);
2112     tcg_gen_concat_i32_i64(rd, lo, hi);
2113
2114     tcg_temp_free_i32(lo);
2115     tcg_temp_free_i32(hi);
2116 }
2117
2118 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2119 {
2120     TCGv_i32 lo = tcg_temp_new_i32();
2121     TCGv_i32 hi = tcg_temp_new_i32();
2122
2123     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2124     tcg_gen_concat_i32_i64(rd, lo, hi);
2125
2126     tcg_temp_free_i32(lo);
2127     tcg_temp_free_i32(hi);
2128 }
2129
2130 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2131 {
2132     static NeonGenTwoOpWidenFn * const opfn[] = {
2133         gen_helper_neon_mull_s8,
2134         gen_helper_neon_mull_s16,
2135         gen_mull_s32,
2136         NULL,
2137     };
2138
2139     return do_long_3d(s, a, opfn[a->size], NULL);
2140 }
2141
2142 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2143 {
2144     static NeonGenTwoOpWidenFn * const opfn[] = {
2145         gen_helper_neon_mull_u8,
2146         gen_helper_neon_mull_u16,
2147         gen_mull_u32,
2148         NULL,
2149     };
2150
2151     return do_long_3d(s, a, opfn[a->size], NULL);
2152 }
2153
2154 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2155     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2156     {                                                                   \
2157         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2158             gen_helper_neon_##MULL##8,                                  \
2159             gen_helper_neon_##MULL##16,                                 \
2160             gen_##MULL##32,                                             \
2161             NULL,                                                       \
2162         };                                                              \
2163         static NeonGenTwo64OpFn * const accfn[] = {                     \
2164             gen_helper_neon_##ACC##l_u16,                               \
2165             gen_helper_neon_##ACC##l_u32,                               \
2166             tcg_gen_##ACC##_i64,                                        \
2167             NULL,                                                       \
2168         };                                                              \
2169         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2170     }
2171
2172 DO_VMLAL(VMLAL_S,mull_s,add)
2173 DO_VMLAL(VMLAL_U,mull_u,add)
2174 DO_VMLAL(VMLSL_S,mull_s,sub)
2175 DO_VMLAL(VMLSL_U,mull_u,sub)
2176
2177 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2178 {
2179     gen_helper_neon_mull_s16(rd, rn, rm);
2180     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2181 }
2182
2183 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2184 {
2185     gen_mull_s32(rd, rn, rm);
2186     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2187 }
2188
2189 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2190 {
2191     static NeonGenTwoOpWidenFn * const opfn[] = {
2192         NULL,
2193         gen_VQDMULL_16,
2194         gen_VQDMULL_32,
2195         NULL,
2196     };
2197
2198     return do_long_3d(s, a, opfn[a->size], NULL);
2199 }
2200
2201 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2202 {
2203     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2204 }
2205
2206 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2207 {
2208     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2209 }
2210
2211 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2212 {
2213     static NeonGenTwoOpWidenFn * const opfn[] = {
2214         NULL,
2215         gen_VQDMULL_16,
2216         gen_VQDMULL_32,
2217         NULL,
2218     };
2219     static NeonGenTwo64OpFn * const accfn[] = {
2220         NULL,
2221         gen_VQDMLAL_acc_16,
2222         gen_VQDMLAL_acc_32,
2223         NULL,
2224     };
2225
2226     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2227 }
2228
2229 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2230 {
2231     gen_helper_neon_negl_u32(rm, rm);
2232     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2233 }
2234
2235 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2236 {
2237     tcg_gen_neg_i64(rm, rm);
2238     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2239 }
2240
2241 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2242 {
2243     static NeonGenTwoOpWidenFn * const opfn[] = {
2244         NULL,
2245         gen_VQDMULL_16,
2246         gen_VQDMULL_32,
2247         NULL,
2248     };
2249     static NeonGenTwo64OpFn * const accfn[] = {
2250         NULL,
2251         gen_VQDMLSL_acc_16,
2252         gen_VQDMLSL_acc_32,
2253         NULL,
2254     };
2255
2256     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2257 }
2258
2259 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2260 {
2261     gen_helper_gvec_3 *fn_gvec;
2262
2263     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2264         return false;
2265     }
2266
2267     /* UNDEF accesses to D16-D31 if they don't exist. */
2268     if (!dc_isar_feature(aa32_simd_r32, s) &&
2269         ((a->vd | a->vn | a->vm) & 0x10)) {
2270         return false;
2271     }
2272
2273     if (a->vd & 1) {
2274         return false;
2275     }
2276
2277     switch (a->size) {
2278     case 0:
2279         fn_gvec = gen_helper_neon_pmull_h;
2280         break;
2281     case 2:
2282         if (!dc_isar_feature(aa32_pmull, s)) {
2283             return false;
2284         }
2285         fn_gvec = gen_helper_gvec_pmull_q;
2286         break;
2287     default:
2288         return false;
2289     }
2290
2291     if (!vfp_access_check(s)) {
2292         return true;
2293     }
2294
2295     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2296                        neon_reg_offset(a->vn, 0),
2297                        neon_reg_offset(a->vm, 0),
2298                        16, 16, 0, fn_gvec);
2299     return true;
2300 }
2301
2302 static void gen_neon_dup_low16(TCGv_i32 var)
2303 {
2304     TCGv_i32 tmp = tcg_temp_new_i32();
2305     tcg_gen_ext16u_i32(var, var);
2306     tcg_gen_shli_i32(tmp, var, 16);
2307     tcg_gen_or_i32(var, var, tmp);
2308     tcg_temp_free_i32(tmp);
2309 }
2310
2311 static void gen_neon_dup_high16(TCGv_i32 var)
2312 {
2313     TCGv_i32 tmp = tcg_temp_new_i32();
2314     tcg_gen_andi_i32(var, var, 0xffff0000);
2315     tcg_gen_shri_i32(tmp, var, 16);
2316     tcg_gen_or_i32(var, var, tmp);
2317     tcg_temp_free_i32(tmp);
2318 }
2319
2320 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2321 {
2322     TCGv_i32 tmp;
2323     if (size == 1) {
2324         tmp = neon_load_reg(reg & 7, reg >> 4);
2325         if (reg & 8) {
2326             gen_neon_dup_high16(tmp);
2327         } else {
2328             gen_neon_dup_low16(tmp);
2329         }
2330     } else {
2331         tmp = neon_load_reg(reg & 15, reg >> 4);
2332     }
2333     return tmp;
2334 }
2335
2336 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2337                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2338 {
2339     /*
2340      * Two registers and a scalar: perform an operation between
2341      * the input elements and the scalar, and then possibly
2342      * perform an accumulation operation of that result into the
2343      * destination.
2344      */
2345     TCGv_i32 scalar;
2346     int pass;
2347
2348     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2349         return false;
2350     }
2351
2352     /* UNDEF accesses to D16-D31 if they don't exist. */
2353     if (!dc_isar_feature(aa32_simd_r32, s) &&
2354         ((a->vd | a->vn | a->vm) & 0x10)) {
2355         return false;
2356     }
2357
2358     if (!opfn) {
2359         /* Bad size (including size == 3, which is a different insn group) */
2360         return false;
2361     }
2362
2363     if (a->q && ((a->vd | a->vn) & 1)) {
2364         return false;
2365     }
2366
2367     if (!vfp_access_check(s)) {
2368         return true;
2369     }
2370
2371     scalar = neon_get_scalar(a->size, a->vm);
2372
2373     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2374         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2375         opfn(tmp, tmp, scalar);
2376         if (accfn) {
2377             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2378             accfn(tmp, rd, tmp);
2379             tcg_temp_free_i32(rd);
2380         }
2381         neon_store_reg(a->vd, pass, tmp);
2382     }
2383     tcg_temp_free_i32(scalar);
2384     return true;
2385 }
2386
2387 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2388 {
2389     static NeonGenTwoOpFn * const opfn[] = {
2390         NULL,
2391         gen_helper_neon_mul_u16,
2392         tcg_gen_mul_i32,
2393         NULL,
2394     };
2395
2396     return do_2scalar(s, a, opfn[a->size], NULL);
2397 }
2398
2399 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2400 {
2401     static NeonGenTwoOpFn * const opfn[] = {
2402         NULL,
2403         gen_helper_neon_mul_u16,
2404         tcg_gen_mul_i32,
2405         NULL,
2406     };
2407     static NeonGenTwoOpFn * const accfn[] = {
2408         NULL,
2409         gen_helper_neon_add_u16,
2410         tcg_gen_add_i32,
2411         NULL,
2412     };
2413
2414     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2415 }
2416
2417 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2418 {
2419     static NeonGenTwoOpFn * const opfn[] = {
2420         NULL,
2421         gen_helper_neon_mul_u16,
2422         tcg_gen_mul_i32,
2423         NULL,
2424     };
2425     static NeonGenTwoOpFn * const accfn[] = {
2426         NULL,
2427         gen_helper_neon_sub_u16,
2428         tcg_gen_sub_i32,
2429         NULL,
2430     };
2431
2432     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2433 }
2434
2435 /*
2436  * Rather than have a float-specific version of do_2scalar just for
2437  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2438  * a NeonGenTwoOpFn.
2439  */
2440 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2441     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2442     {                                                           \
2443         TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);             \
2444         FUNC(rd, rn, rm, fpstatus);                             \
2445         tcg_temp_free_ptr(fpstatus);                            \
2446     }
2447
2448 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2449 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2450 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
2451
2452 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2453 {
2454     static NeonGenTwoOpFn * const opfn[] = {
2455         NULL,
2456         NULL, /* TODO: fp16 support */
2457         gen_VMUL_F_mul,
2458         NULL,
2459     };
2460
2461     return do_2scalar(s, a, opfn[a->size], NULL);
2462 }
2463
2464 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2465 {
2466     static NeonGenTwoOpFn * const opfn[] = {
2467         NULL,
2468         NULL, /* TODO: fp16 support */
2469         gen_VMUL_F_mul,
2470         NULL,
2471     };
2472     static NeonGenTwoOpFn * const accfn[] = {
2473         NULL,
2474         NULL, /* TODO: fp16 support */
2475         gen_VMUL_F_add,
2476         NULL,
2477     };
2478
2479     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2480 }
2481
2482 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2483 {
2484     static NeonGenTwoOpFn * const opfn[] = {
2485         NULL,
2486         NULL, /* TODO: fp16 support */
2487         gen_VMUL_F_mul,
2488         NULL,
2489     };
2490     static NeonGenTwoOpFn * const accfn[] = {
2491         NULL,
2492         NULL, /* TODO: fp16 support */
2493         gen_VMUL_F_sub,
2494         NULL,
2495     };
2496
2497     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2498 }
2499
2500 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2501 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2502 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2503 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2504
2505 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2506 {
2507     static NeonGenTwoOpFn * const opfn[] = {
2508         NULL,
2509         gen_VQDMULH_16,
2510         gen_VQDMULH_32,
2511         NULL,
2512     };
2513
2514     return do_2scalar(s, a, opfn[a->size], NULL);
2515 }
2516
2517 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2518 {
2519     static NeonGenTwoOpFn * const opfn[] = {
2520         NULL,
2521         gen_VQRDMULH_16,
2522         gen_VQRDMULH_32,
2523         NULL,
2524     };
2525
2526     return do_2scalar(s, a, opfn[a->size], NULL);
2527 }
2528
2529 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2530                             NeonGenThreeOpEnvFn *opfn)
2531 {
2532     /*
2533      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2534      * performs a kind of fused op-then-accumulate using a helper
2535      * function that takes all of rd, rn and the scalar at once.
2536      */
2537     TCGv_i32 scalar;
2538     int pass;
2539
2540     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2541         return false;
2542     }
2543
2544     if (!dc_isar_feature(aa32_rdm, s)) {
2545         return false;
2546     }
2547
2548     /* UNDEF accesses to D16-D31 if they don't exist. */
2549     if (!dc_isar_feature(aa32_simd_r32, s) &&
2550         ((a->vd | a->vn | a->vm) & 0x10)) {
2551         return false;
2552     }
2553
2554     if (!opfn) {
2555         /* Bad size (including size == 3, which is a different insn group) */
2556         return false;
2557     }
2558
2559     if (a->q && ((a->vd | a->vn) & 1)) {
2560         return false;
2561     }
2562
2563     if (!vfp_access_check(s)) {
2564         return true;
2565     }
2566
2567     scalar = neon_get_scalar(a->size, a->vm);
2568
2569     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2570         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2571         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2572         opfn(rd, cpu_env, rn, scalar, rd);
2573         tcg_temp_free_i32(rn);
2574         neon_store_reg(a->vd, pass, rd);
2575     }
2576     tcg_temp_free_i32(scalar);
2577
2578     return true;
2579 }
2580
2581 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2582 {
2583     static NeonGenThreeOpEnvFn *opfn[] = {
2584         NULL,
2585         gen_helper_neon_qrdmlah_s16,
2586         gen_helper_neon_qrdmlah_s32,
2587         NULL,
2588     };
2589     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2590 }
2591
2592 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2593 {
2594     static NeonGenThreeOpEnvFn *opfn[] = {
2595         NULL,
2596         gen_helper_neon_qrdmlsh_s16,
2597         gen_helper_neon_qrdmlsh_s32,
2598         NULL,
2599     };
2600     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2601 }
2602
2603 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2604                             NeonGenTwoOpWidenFn *opfn,
2605                             NeonGenTwo64OpFn *accfn)
2606 {
2607     /*
2608      * Two registers and a scalar, long operations: perform an
2609      * operation on the input elements and the scalar which produces
2610      * a double-width result, and then possibly perform an accumulation
2611      * operation of that result into the destination.
2612      */
2613     TCGv_i32 scalar, rn;
2614     TCGv_i64 rn0_64, rn1_64;
2615
2616     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2617         return false;
2618     }
2619
2620     /* UNDEF accesses to D16-D31 if they don't exist. */
2621     if (!dc_isar_feature(aa32_simd_r32, s) &&
2622         ((a->vd | a->vn | a->vm) & 0x10)) {
2623         return false;
2624     }
2625
2626     if (!opfn) {
2627         /* Bad size (including size == 3, which is a different insn group) */
2628         return false;
2629     }
2630
2631     if (a->vd & 1) {
2632         return false;
2633     }
2634
2635     if (!vfp_access_check(s)) {
2636         return true;
2637     }
2638
2639     scalar = neon_get_scalar(a->size, a->vm);
2640
2641     /* Load all inputs before writing any outputs, in case of overlap */
2642     rn = neon_load_reg(a->vn, 0);
2643     rn0_64 = tcg_temp_new_i64();
2644     opfn(rn0_64, rn, scalar);
2645     tcg_temp_free_i32(rn);
2646
2647     rn = neon_load_reg(a->vn, 1);
2648     rn1_64 = tcg_temp_new_i64();
2649     opfn(rn1_64, rn, scalar);
2650     tcg_temp_free_i32(rn);
2651     tcg_temp_free_i32(scalar);
2652
2653     if (accfn) {
2654         TCGv_i64 t64 = tcg_temp_new_i64();
2655         neon_load_reg64(t64, a->vd);
2656         accfn(t64, t64, rn0_64);
2657         neon_store_reg64(t64, a->vd);
2658         neon_load_reg64(t64, a->vd + 1);
2659         accfn(t64, t64, rn1_64);
2660         neon_store_reg64(t64, a->vd + 1);
2661         tcg_temp_free_i64(t64);
2662     } else {
2663         neon_store_reg64(rn0_64, a->vd);
2664         neon_store_reg64(rn1_64, a->vd + 1);
2665     }
2666     tcg_temp_free_i64(rn0_64);
2667     tcg_temp_free_i64(rn1_64);
2668     return true;
2669 }
2670
2671 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2672 {
2673     static NeonGenTwoOpWidenFn * const opfn[] = {
2674         NULL,
2675         gen_helper_neon_mull_s16,
2676         gen_mull_s32,
2677         NULL,
2678     };
2679
2680     return do_2scalar_long(s, a, opfn[a->size], NULL);
2681 }
2682
2683 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2684 {
2685     static NeonGenTwoOpWidenFn * const opfn[] = {
2686         NULL,
2687         gen_helper_neon_mull_u16,
2688         gen_mull_u32,
2689         NULL,
2690     };
2691
2692     return do_2scalar_long(s, a, opfn[a->size], NULL);
2693 }
2694
2695 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2696     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2697     {                                                                   \
2698         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2699             NULL,                                                       \
2700             gen_helper_neon_##MULL##16,                                 \
2701             gen_##MULL##32,                                             \
2702             NULL,                                                       \
2703         };                                                              \
2704         static NeonGenTwo64OpFn * const accfn[] = {                     \
2705             NULL,                                                       \
2706             gen_helper_neon_##ACC##l_u32,                               \
2707             tcg_gen_##ACC##_i64,                                        \
2708             NULL,                                                       \
2709         };                                                              \
2710         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2711     }
2712
2713 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2714 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2715 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2716 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2717
2718 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2719 {
2720     static NeonGenTwoOpWidenFn * const opfn[] = {
2721         NULL,
2722         gen_VQDMULL_16,
2723         gen_VQDMULL_32,
2724         NULL,
2725     };
2726
2727     return do_2scalar_long(s, a, opfn[a->size], NULL);
2728 }
2729
2730 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2731 {
2732     static NeonGenTwoOpWidenFn * const opfn[] = {
2733         NULL,
2734         gen_VQDMULL_16,
2735         gen_VQDMULL_32,
2736         NULL,
2737     };
2738     static NeonGenTwo64OpFn * const accfn[] = {
2739         NULL,
2740         gen_VQDMLAL_acc_16,
2741         gen_VQDMLAL_acc_32,
2742         NULL,
2743     };
2744
2745     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2746 }
2747
2748 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2749 {
2750     static NeonGenTwoOpWidenFn * const opfn[] = {
2751         NULL,
2752         gen_VQDMULL_16,
2753         gen_VQDMULL_32,
2754         NULL,
2755     };
2756     static NeonGenTwo64OpFn * const accfn[] = {
2757         NULL,
2758         gen_VQDMLSL_acc_16,
2759         gen_VQDMLSL_acc_32,
2760         NULL,
2761     };
2762
2763     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2764 }
2765
2766 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2767 {
2768     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2769         return false;
2770     }
2771
2772     /* UNDEF accesses to D16-D31 if they don't exist. */
2773     if (!dc_isar_feature(aa32_simd_r32, s) &&
2774         ((a->vd | a->vn | a->vm) & 0x10)) {
2775         return false;
2776     }
2777
2778     if ((a->vn | a->vm | a->vd) & a->q) {
2779         return false;
2780     }
2781
2782     if (a->imm > 7 && !a->q) {
2783         return false;
2784     }
2785
2786     if (!vfp_access_check(s)) {
2787         return true;
2788     }
2789
2790     if (!a->q) {
2791         /* Extract 64 bits from <Vm:Vn> */
2792         TCGv_i64 left, right, dest;
2793
2794         left = tcg_temp_new_i64();
2795         right = tcg_temp_new_i64();
2796         dest = tcg_temp_new_i64();
2797
2798         neon_load_reg64(right, a->vn);
2799         neon_load_reg64(left, a->vm);
2800         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2801         neon_store_reg64(dest, a->vd);
2802
2803         tcg_temp_free_i64(left);
2804         tcg_temp_free_i64(right);
2805         tcg_temp_free_i64(dest);
2806     } else {
2807         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2808         TCGv_i64 left, middle, right, destleft, destright;
2809
2810         left = tcg_temp_new_i64();
2811         middle = tcg_temp_new_i64();
2812         right = tcg_temp_new_i64();
2813         destleft = tcg_temp_new_i64();
2814         destright = tcg_temp_new_i64();
2815
2816         if (a->imm < 8) {
2817             neon_load_reg64(right, a->vn);
2818             neon_load_reg64(middle, a->vn + 1);
2819             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2820             neon_load_reg64(left, a->vm);
2821             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2822         } else {
2823             neon_load_reg64(right, a->vn + 1);
2824             neon_load_reg64(middle, a->vm);
2825             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2826             neon_load_reg64(left, a->vm + 1);
2827             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2828         }
2829
2830         neon_store_reg64(destright, a->vd);
2831         neon_store_reg64(destleft, a->vd + 1);
2832
2833         tcg_temp_free_i64(destright);
2834         tcg_temp_free_i64(destleft);
2835         tcg_temp_free_i64(right);
2836         tcg_temp_free_i64(middle);
2837         tcg_temp_free_i64(left);
2838     }
2839     return true;
2840 }
2841
2842 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2843 {
2844     int n;
2845     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2846     TCGv_ptr ptr1;
2847
2848     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2849         return false;
2850     }
2851
2852     /* UNDEF accesses to D16-D31 if they don't exist. */
2853     if (!dc_isar_feature(aa32_simd_r32, s) &&
2854         ((a->vd | a->vn | a->vm) & 0x10)) {
2855         return false;
2856     }
2857
2858     if (!vfp_access_check(s)) {
2859         return true;
2860     }
2861
2862     n = a->len + 1;
2863     if ((a->vn + n) > 32) {
2864         /*
2865          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2866          * helper function running off the end of the register file.
2867          */
2868         return false;
2869     }
2870     n <<= 3;
2871     if (a->op) {
2872         tmp = neon_load_reg(a->vd, 0);
2873     } else {
2874         tmp = tcg_temp_new_i32();
2875         tcg_gen_movi_i32(tmp, 0);
2876     }
2877     tmp2 = neon_load_reg(a->vm, 0);
2878     ptr1 = vfp_reg_ptr(true, a->vn);
2879     tmp4 = tcg_const_i32(n);
2880     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
2881     tcg_temp_free_i32(tmp);
2882     if (a->op) {
2883         tmp = neon_load_reg(a->vd, 1);
2884     } else {
2885         tmp = tcg_temp_new_i32();
2886         tcg_gen_movi_i32(tmp, 0);
2887     }
2888     tmp3 = neon_load_reg(a->vm, 1);
2889     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
2890     tcg_temp_free_i32(tmp4);
2891     tcg_temp_free_ptr(ptr1);
2892     neon_store_reg(a->vd, 0, tmp2);
2893     neon_store_reg(a->vd, 1, tmp3);
2894     tcg_temp_free_i32(tmp);
2895     return true;
2896 }
2897
2898 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2899 {
2900     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2901         return false;
2902     }
2903
2904     /* UNDEF accesses to D16-D31 if they don't exist. */
2905     if (!dc_isar_feature(aa32_simd_r32, s) &&
2906         ((a->vd | a->vm) & 0x10)) {
2907         return false;
2908     }
2909
2910     if (a->vd & a->q) {
2911         return false;
2912     }
2913
2914     if (!vfp_access_check(s)) {
2915         return true;
2916     }
2917
2918     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
2919                          neon_element_offset(a->vm, a->index, a->size),
2920                          a->q ? 16 : 8, a->q ? 16 : 8);
2921     return true;
2922 }
2923
2924 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2925 {
2926     int pass, half;
2927
2928     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2929         return false;
2930     }
2931
2932     /* UNDEF accesses to D16-D31 if they don't exist. */
2933     if (!dc_isar_feature(aa32_simd_r32, s) &&
2934         ((a->vd | a->vm) & 0x10)) {
2935         return false;
2936     }
2937
2938     if ((a->vd | a->vm) & a->q) {
2939         return false;
2940     }
2941
2942     if (a->size == 3) {
2943         return false;
2944     }
2945
2946     if (!vfp_access_check(s)) {
2947         return true;
2948     }
2949
2950     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2951         TCGv_i32 tmp[2];
2952
2953         for (half = 0; half < 2; half++) {
2954             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
2955             switch (a->size) {
2956             case 0:
2957                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2958                 break;
2959             case 1:
2960                 gen_swap_half(tmp[half], tmp[half]);
2961                 break;
2962             case 2:
2963                 break;
2964             default:
2965                 g_assert_not_reached();
2966             }
2967         }
2968         neon_store_reg(a->vd, pass * 2, tmp[1]);
2969         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
2970     }
2971     return true;
2972 }
2973
2974 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2975                               NeonGenWidenFn *widenfn,
2976                               NeonGenTwo64OpFn *opfn,
2977                               NeonGenTwo64OpFn *accfn)
2978 {
2979     /*
2980      * Pairwise long operations: widen both halves of the pair,
2981      * combine the pairs with the opfn, and then possibly accumulate
2982      * into the destination with the accfn.
2983      */
2984     int pass;
2985
2986     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2987         return false;
2988     }
2989
2990     /* UNDEF accesses to D16-D31 if they don't exist. */
2991     if (!dc_isar_feature(aa32_simd_r32, s) &&
2992         ((a->vd | a->vm) & 0x10)) {
2993         return false;
2994     }
2995
2996     if ((a->vd | a->vm) & a->q) {
2997         return false;
2998     }
2999
3000     if (!widenfn) {
3001         return false;
3002     }
3003
3004     if (!vfp_access_check(s)) {
3005         return true;
3006     }
3007
3008     for (pass = 0; pass < a->q + 1; pass++) {
3009         TCGv_i32 tmp;
3010         TCGv_i64 rm0_64, rm1_64, rd_64;
3011
3012         rm0_64 = tcg_temp_new_i64();
3013         rm1_64 = tcg_temp_new_i64();
3014         rd_64 = tcg_temp_new_i64();
3015         tmp = neon_load_reg(a->vm, pass * 2);
3016         widenfn(rm0_64, tmp);
3017         tcg_temp_free_i32(tmp);
3018         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3019         widenfn(rm1_64, tmp);
3020         tcg_temp_free_i32(tmp);
3021         opfn(rd_64, rm0_64, rm1_64);
3022         tcg_temp_free_i64(rm0_64);
3023         tcg_temp_free_i64(rm1_64);
3024
3025         if (accfn) {
3026             TCGv_i64 tmp64 = tcg_temp_new_i64();
3027             neon_load_reg64(tmp64, a->vd + pass);
3028             accfn(rd_64, tmp64, rd_64);
3029             tcg_temp_free_i64(tmp64);
3030         }
3031         neon_store_reg64(rd_64, a->vd + pass);
3032         tcg_temp_free_i64(rd_64);
3033     }
3034     return true;
3035 }
3036
3037 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3038 {
3039     static NeonGenWidenFn * const widenfn[] = {
3040         gen_helper_neon_widen_s8,
3041         gen_helper_neon_widen_s16,
3042         tcg_gen_ext_i32_i64,
3043         NULL,
3044     };
3045     static NeonGenTwo64OpFn * const opfn[] = {
3046         gen_helper_neon_paddl_u16,
3047         gen_helper_neon_paddl_u32,
3048         tcg_gen_add_i64,
3049         NULL,
3050     };
3051
3052     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3053 }
3054
3055 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3056 {
3057     static NeonGenWidenFn * const widenfn[] = {
3058         gen_helper_neon_widen_u8,
3059         gen_helper_neon_widen_u16,
3060         tcg_gen_extu_i32_i64,
3061         NULL,
3062     };
3063     static NeonGenTwo64OpFn * const opfn[] = {
3064         gen_helper_neon_paddl_u16,
3065         gen_helper_neon_paddl_u32,
3066         tcg_gen_add_i64,
3067         NULL,
3068     };
3069
3070     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3071 }
3072
3073 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3074 {
3075     static NeonGenWidenFn * const widenfn[] = {
3076         gen_helper_neon_widen_s8,
3077         gen_helper_neon_widen_s16,
3078         tcg_gen_ext_i32_i64,
3079         NULL,
3080     };
3081     static NeonGenTwo64OpFn * const opfn[] = {
3082         gen_helper_neon_paddl_u16,
3083         gen_helper_neon_paddl_u32,
3084         tcg_gen_add_i64,
3085         NULL,
3086     };
3087     static NeonGenTwo64OpFn * const accfn[] = {
3088         gen_helper_neon_addl_u16,
3089         gen_helper_neon_addl_u32,
3090         tcg_gen_add_i64,
3091         NULL,
3092     };
3093
3094     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3095                              accfn[a->size]);
3096 }
3097
3098 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3099 {
3100     static NeonGenWidenFn * const widenfn[] = {
3101         gen_helper_neon_widen_u8,
3102         gen_helper_neon_widen_u16,
3103         tcg_gen_extu_i32_i64,
3104         NULL,
3105     };
3106     static NeonGenTwo64OpFn * const opfn[] = {
3107         gen_helper_neon_paddl_u16,
3108         gen_helper_neon_paddl_u32,
3109         tcg_gen_add_i64,
3110         NULL,
3111     };
3112     static NeonGenTwo64OpFn * const accfn[] = {
3113         gen_helper_neon_addl_u16,
3114         gen_helper_neon_addl_u32,
3115         tcg_gen_add_i64,
3116         NULL,
3117     };
3118
3119     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3120                              accfn[a->size]);
3121 }
3122
3123 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3124
3125 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3126                        ZipFn *fn)
3127 {
3128     TCGv_ptr pd, pm;
3129
3130     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3131         return false;
3132     }
3133
3134     /* UNDEF accesses to D16-D31 if they don't exist. */
3135     if (!dc_isar_feature(aa32_simd_r32, s) &&
3136         ((a->vd | a->vm) & 0x10)) {
3137         return false;
3138     }
3139
3140     if ((a->vd | a->vm) & a->q) {
3141         return false;
3142     }
3143
3144     if (!fn) {
3145         /* Bad size or size/q combination */
3146         return false;
3147     }
3148
3149     if (!vfp_access_check(s)) {
3150         return true;
3151     }
3152
3153     pd = vfp_reg_ptr(true, a->vd);
3154     pm = vfp_reg_ptr(true, a->vm);
3155     fn(pd, pm);
3156     tcg_temp_free_ptr(pd);
3157     tcg_temp_free_ptr(pm);
3158     return true;
3159 }
3160
3161 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3162 {
3163     static ZipFn * const fn[2][4] = {
3164         {
3165             gen_helper_neon_unzip8,
3166             gen_helper_neon_unzip16,
3167             NULL,
3168             NULL,
3169         }, {
3170             gen_helper_neon_qunzip8,
3171             gen_helper_neon_qunzip16,
3172             gen_helper_neon_qunzip32,
3173             NULL,
3174         }
3175     };
3176     return do_zip_uzp(s, a, fn[a->q][a->size]);
3177 }
3178
3179 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3180 {
3181     static ZipFn * const fn[2][4] = {
3182         {
3183             gen_helper_neon_zip8,
3184             gen_helper_neon_zip16,
3185             NULL,
3186             NULL,
3187         }, {
3188             gen_helper_neon_qzip8,
3189             gen_helper_neon_qzip16,
3190             gen_helper_neon_qzip32,
3191             NULL,
3192         }
3193     };
3194     return do_zip_uzp(s, a, fn[a->q][a->size]);
3195 }
3196
3197 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3198                      NeonGenNarrowEnvFn *narrowfn)
3199 {
3200     TCGv_i64 rm;
3201     TCGv_i32 rd0, rd1;
3202
3203     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3204         return false;
3205     }
3206
3207     /* UNDEF accesses to D16-D31 if they don't exist. */
3208     if (!dc_isar_feature(aa32_simd_r32, s) &&
3209         ((a->vd | a->vm) & 0x10)) {
3210         return false;
3211     }
3212
3213     if (a->vm & 1) {
3214         return false;
3215     }
3216
3217     if (!narrowfn) {
3218         return false;
3219     }
3220
3221     if (!vfp_access_check(s)) {
3222         return true;
3223     }
3224
3225     rm = tcg_temp_new_i64();
3226     rd0 = tcg_temp_new_i32();
3227     rd1 = tcg_temp_new_i32();
3228
3229     neon_load_reg64(rm, a->vm);
3230     narrowfn(rd0, cpu_env, rm);
3231     neon_load_reg64(rm, a->vm + 1);
3232     narrowfn(rd1, cpu_env, rm);
3233     neon_store_reg(a->vd, 0, rd0);
3234     neon_store_reg(a->vd, 1, rd1);
3235     tcg_temp_free_i64(rm);
3236     return true;
3237 }
3238
3239 #define DO_VMOVN(INSN, FUNC)                                    \
3240     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3241     {                                                           \
3242         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3243             FUNC##8,                                            \
3244             FUNC##16,                                           \
3245             FUNC##32,                                           \
3246             NULL,                                               \
3247         };                                                      \
3248         return do_vmovn(s, a, narrowfn[a->size]);               \
3249     }
3250
3251 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3252 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3253 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3254 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3255
3256 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3257 {
3258     TCGv_i32 rm0, rm1;
3259     TCGv_i64 rd;
3260     static NeonGenWidenFn * const widenfns[] = {
3261         gen_helper_neon_widen_u8,
3262         gen_helper_neon_widen_u16,
3263         tcg_gen_extu_i32_i64,
3264         NULL,
3265     };
3266     NeonGenWidenFn *widenfn = widenfns[a->size];
3267
3268     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3269         return false;
3270     }
3271
3272     /* UNDEF accesses to D16-D31 if they don't exist. */
3273     if (!dc_isar_feature(aa32_simd_r32, s) &&
3274         ((a->vd | a->vm) & 0x10)) {
3275         return false;
3276     }
3277
3278     if (a->vd & 1) {
3279         return false;
3280     }
3281
3282     if (!widenfn) {
3283         return false;
3284     }
3285
3286     if (!vfp_access_check(s)) {
3287         return true;
3288     }
3289
3290     rd = tcg_temp_new_i64();
3291
3292     rm0 = neon_load_reg(a->vm, 0);
3293     rm1 = neon_load_reg(a->vm, 1);
3294
3295     widenfn(rd, rm0);
3296     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3297     neon_store_reg64(rd, a->vd);
3298     widenfn(rd, rm1);
3299     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3300     neon_store_reg64(rd, a->vd + 1);
3301
3302     tcg_temp_free_i64(rd);
3303     tcg_temp_free_i32(rm0);
3304     tcg_temp_free_i32(rm1);
3305     return true;
3306 }
3307
3308 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3309 {
3310     TCGv_ptr fpst;
3311     TCGv_i32 ahp, tmp, tmp2, tmp3;
3312
3313     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3314         !dc_isar_feature(aa32_fp16_spconv, s)) {
3315         return false;
3316     }
3317
3318     /* UNDEF accesses to D16-D31 if they don't exist. */
3319     if (!dc_isar_feature(aa32_simd_r32, s) &&
3320         ((a->vd | a->vm) & 0x10)) {
3321         return false;
3322     }
3323
3324     if ((a->vm & 1) || (a->size != 1)) {
3325         return false;
3326     }
3327
3328     if (!vfp_access_check(s)) {
3329         return true;
3330     }
3331
3332     fpst = fpstatus_ptr(FPST_STD);
3333     ahp = get_ahp_flag();
3334     tmp = neon_load_reg(a->vm, 0);
3335     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3336     tmp2 = neon_load_reg(a->vm, 1);
3337     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3338     tcg_gen_shli_i32(tmp2, tmp2, 16);
3339     tcg_gen_or_i32(tmp2, tmp2, tmp);
3340     tcg_temp_free_i32(tmp);
3341     tmp = neon_load_reg(a->vm, 2);
3342     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3343     tmp3 = neon_load_reg(a->vm, 3);
3344     neon_store_reg(a->vd, 0, tmp2);
3345     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3346     tcg_gen_shli_i32(tmp3, tmp3, 16);
3347     tcg_gen_or_i32(tmp3, tmp3, tmp);
3348     neon_store_reg(a->vd, 1, tmp3);
3349     tcg_temp_free_i32(tmp);
3350     tcg_temp_free_i32(ahp);
3351     tcg_temp_free_ptr(fpst);
3352
3353     return true;
3354 }
3355
3356 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3357 {
3358     TCGv_ptr fpst;
3359     TCGv_i32 ahp, tmp, tmp2, tmp3;
3360
3361     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3362         !dc_isar_feature(aa32_fp16_spconv, s)) {
3363         return false;
3364     }
3365
3366     /* UNDEF accesses to D16-D31 if they don't exist. */
3367     if (!dc_isar_feature(aa32_simd_r32, s) &&
3368         ((a->vd | a->vm) & 0x10)) {
3369         return false;
3370     }
3371
3372     if ((a->vd & 1) || (a->size != 1)) {
3373         return false;
3374     }
3375
3376     if (!vfp_access_check(s)) {
3377         return true;
3378     }
3379
3380     fpst = fpstatus_ptr(FPST_STD);
3381     ahp = get_ahp_flag();
3382     tmp3 = tcg_temp_new_i32();
3383     tmp = neon_load_reg(a->vm, 0);
3384     tmp2 = neon_load_reg(a->vm, 1);
3385     tcg_gen_ext16u_i32(tmp3, tmp);
3386     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3387     neon_store_reg(a->vd, 0, tmp3);
3388     tcg_gen_shri_i32(tmp, tmp, 16);
3389     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3390     neon_store_reg(a->vd, 1, tmp);
3391     tmp3 = tcg_temp_new_i32();
3392     tcg_gen_ext16u_i32(tmp3, tmp2);
3393     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3394     neon_store_reg(a->vd, 2, tmp3);
3395     tcg_gen_shri_i32(tmp2, tmp2, 16);
3396     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3397     neon_store_reg(a->vd, 3, tmp2);
3398     tcg_temp_free_i32(ahp);
3399     tcg_temp_free_ptr(fpst);
3400
3401     return true;
3402 }
3403
3404 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3405 {
3406     int vec_size = a->q ? 16 : 8;
3407     int rd_ofs = neon_reg_offset(a->vd, 0);
3408     int rm_ofs = neon_reg_offset(a->vm, 0);
3409
3410     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3411         return false;
3412     }
3413
3414     /* UNDEF accesses to D16-D31 if they don't exist. */
3415     if (!dc_isar_feature(aa32_simd_r32, s) &&
3416         ((a->vd | a->vm) & 0x10)) {
3417         return false;
3418     }
3419
3420     if (a->size == 3) {
3421         return false;
3422     }
3423
3424     if ((a->vd | a->vm) & a->q) {
3425         return false;
3426     }
3427
3428     if (!vfp_access_check(s)) {
3429         return true;
3430     }
3431
3432     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3433
3434     return true;
3435 }
3436
3437 #define DO_2MISC_VEC(INSN, FN)                                  \
3438     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3439     {                                                           \
3440         return do_2misc_vec(s, a, FN);                          \
3441     }
3442
3443 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3444 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3445 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3446 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3447 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3448 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3449 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3450
3451 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3452 {
3453     if (a->size != 0) {
3454         return false;
3455     }
3456     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3457 }
3458
3459 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3460     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3461                          uint32_t rm_ofs, uint32_t oprsz,               \
3462                          uint32_t maxsz)                                \
3463     {                                                                   \
3464         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3465                            DATA, FUNC);                                 \
3466     }
3467
3468 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3469     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3470                          uint32_t rm_ofs, uint32_t oprsz,               \
3471                          uint32_t maxsz)                                \
3472     {                                                                   \
3473         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3474     }
3475
3476 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3477 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3478 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3479 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3480 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3481 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3482 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3483
3484 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3485     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3486     {                                                           \
3487         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3488             return false;                                       \
3489         }                                                       \
3490         return do_2misc_vec(s, a, gen_##INSN);                  \
3491     }
3492
3493 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3494 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3495 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3496 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3497 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3498 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3499 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3500
3501 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3502 {
3503     int pass;
3504
3505     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3506     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3507         return false;
3508     }
3509
3510     /* UNDEF accesses to D16-D31 if they don't exist. */
3511     if (!dc_isar_feature(aa32_simd_r32, s) &&
3512         ((a->vd | a->vm) & 0x10)) {
3513         return false;
3514     }
3515
3516     if (!fn) {
3517         return false;
3518     }
3519
3520     if ((a->vd | a->vm) & a->q) {
3521         return false;
3522     }
3523
3524     if (!vfp_access_check(s)) {
3525         return true;
3526     }
3527
3528     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3529         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
3530         fn(tmp, tmp);
3531         neon_store_reg(a->vd, pass, tmp);
3532     }
3533
3534     return true;
3535 }
3536
3537 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3538 {
3539     static NeonGenOneOpFn * const fn[] = {
3540         tcg_gen_bswap32_i32,
3541         gen_swap_half,
3542         NULL,
3543         NULL,
3544     };
3545     return do_2misc(s, a, fn[a->size]);
3546 }
3547
3548 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3549 {
3550     if (a->size != 0) {
3551         return false;
3552     }
3553     return do_2misc(s, a, gen_rev16);
3554 }
3555
3556 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3557 {
3558     static NeonGenOneOpFn * const fn[] = {