migration: increase max-bandwidth to 128 MiB/s (1 Gib/s)
[qemu.git] / target / arm / translate-neon.c.inc
1 /*
2  *  ARM translation: AArch32 Neon instructions
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *  Copyright (c) 2020 Linaro, Ltd.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21  */
22
23 /*
24  * This file is intended to be included from translate.c; it uses
25  * some macros and definitions provided by that file.
26  * It might be possible to convert it to a standalone .c file eventually.
27  */
28
29 static inline int plus1(DisasContext *s, int x)
30 {
31     return x + 1;
32 }
33
34 static inline int rsub_64(DisasContext *s, int x)
35 {
36     return 64 - x;
37 }
38
39 static inline int rsub_32(DisasContext *s, int x)
40 {
41     return 32 - x;
42 }
43 static inline int rsub_16(DisasContext *s, int x)
44 {
45     return 16 - x;
46 }
47 static inline int rsub_8(DisasContext *s, int x)
48 {
49     return 8 - x;
50 }
51
52 static inline int neon_3same_fp_size(DisasContext *s, int x)
53 {
54     /* Convert 0==fp32, 1==fp16 into a MO_* value */
55     return MO_32 - x;
56 }
57
58 /* Include the generated Neon decoder */
59 #include "decode-neon-dp.c.inc"
60 #include "decode-neon-ls.c.inc"
61 #include "decode-neon-shared.c.inc"
62
63 /* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
64  * where 0 is the least significant end of the register.
65  */
66 static inline long
67 neon_element_offset(int reg, int element, MemOp size)
68 {
69     int element_size = 1 << size;
70     int ofs = element * element_size;
71 #ifdef HOST_WORDS_BIGENDIAN
72     /* Calculate the offset assuming fully little-endian,
73      * then XOR to account for the order of the 8-byte units.
74      */
75     if (element_size < 8) {
76         ofs ^= 8 - element_size;
77     }
78 #endif
79     return neon_reg_offset(reg, 0) + ofs;
80 }
81
82 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
83 {
84     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
85
86     switch (mop) {
87     case MO_UB:
88         tcg_gen_ld8u_i32(var, cpu_env, offset);
89         break;
90     case MO_UW:
91         tcg_gen_ld16u_i32(var, cpu_env, offset);
92         break;
93     case MO_UL:
94         tcg_gen_ld_i32(var, cpu_env, offset);
95         break;
96     default:
97         g_assert_not_reached();
98     }
99 }
100
101 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
102 {
103     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
104
105     switch (mop) {
106     case MO_UB:
107         tcg_gen_ld8u_i64(var, cpu_env, offset);
108         break;
109     case MO_UW:
110         tcg_gen_ld16u_i64(var, cpu_env, offset);
111         break;
112     case MO_UL:
113         tcg_gen_ld32u_i64(var, cpu_env, offset);
114         break;
115     case MO_Q:
116         tcg_gen_ld_i64(var, cpu_env, offset);
117         break;
118     default:
119         g_assert_not_reached();
120     }
121 }
122
123 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
124 {
125     long offset = neon_element_offset(reg, ele, size);
126
127     switch (size) {
128     case MO_8:
129         tcg_gen_st8_i32(var, cpu_env, offset);
130         break;
131     case MO_16:
132         tcg_gen_st16_i32(var, cpu_env, offset);
133         break;
134     case MO_32:
135         tcg_gen_st_i32(var, cpu_env, offset);
136         break;
137     default:
138         g_assert_not_reached();
139     }
140 }
141
142 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
143 {
144     long offset = neon_element_offset(reg, ele, size);
145
146     switch (size) {
147     case MO_8:
148         tcg_gen_st8_i64(var, cpu_env, offset);
149         break;
150     case MO_16:
151         tcg_gen_st16_i64(var, cpu_env, offset);
152         break;
153     case MO_32:
154         tcg_gen_st32_i64(var, cpu_env, offset);
155         break;
156     case MO_64:
157         tcg_gen_st_i64(var, cpu_env, offset);
158         break;
159     default:
160         g_assert_not_reached();
161     }
162 }
163
/*
 * VCMLA (vector): floating-point complex multiply-accumulate.
 * Requires the FCMA extension; the fp16 form also needs fp16 arithmetic.
 */
static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* A Q-reg operation (q == 1) requires even D register numbers. */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Operation size in bytes: 8 for a D-reg op, 16 for a Q-reg op. */
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcmlah : gen_helper_gvec_fcmlas;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
201
/*
 * VCADD (vector): floating-point complex add with rotate.
 * Requires the FCMA extension; the fp16 form also needs fp16 arithmetic.
 */
static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* A Q-reg operation (q == 1) requires even D register numbers. */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Operation size in bytes: 8 for a D-reg op, 16 for a Q-reg op. */
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
239
/*
 * VSDOT/VUDOT (vector): signed/unsigned dot product of byte elements.
 * Requires the dot-product extension; this is an integer operation
 * so no fp status pointer is needed.
 */
static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
{
    int opr_sz;
    gen_helper_gvec_3 *fn_gvec;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* A Q-reg operation (q == 1) requires even D register numbers. */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Operation size in bytes: 8 for a D-reg op, 16 for a Q-reg op. */
    opr_sz = (1 + a->q) * 8;
    fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       opr_sz, opr_sz, 0, fn_gvec);
    return true;
}
271
/*
 * VFMAL/VFMSL (vector): widening half- to single-precision
 * multiply-accumulate. Requires the FHM extension.
 */
static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /*
     * UNDEF accesses to D16-D31 if they don't exist.
     * Only vd is checked here: the sources are half the width of the
     * destination, so vn/vm index a different register range.
     */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    /* A Q-reg destination (q == 1) requires an even D register number. */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Operation size in bytes: 8 for a D-reg op, 16 for a Q-reg op. */
    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}
302
/*
 * VCMLA (by element): complex multiply-accumulate with one operand
 * taken from a scalar element. Note that vm indexes the scalar and is
 * therefore not subject to the Q-reg even-number restriction.
 */
static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    gen_helper_gvec_3_ptr *fn_gvec_ptr;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* A Q-reg operation requires even D register numbers for vd/vn. */
    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcmlah_idx : gen_helper_gvec_fcmlas_idx;
    /* Operation size in bytes: 8 for a D-reg op, 16 for a Q-reg op. */
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    /* The helper decodes the scalar index and rotation from this data. */
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz,
                       (a->index << 2) | a->rot, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
342
343 static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
344 {
345     gen_helper_gvec_3 *fn_gvec;
346     int opr_sz;
347     TCGv_ptr fpst;
348
349     if (!dc_isar_feature(aa32_dp, s)) {
350         return false;
351     }
352
353     /* UNDEF accesses to D16-D31 if they don't exist. */
354     if (!dc_isar_feature(aa32_simd_r32, s) &&
355         ((a->vd | a->vn) & 0x10)) {
356         return false;
357     }
358
359     if ((a->vd | a->vn) & a->q) {
360         return false;
361     }
362
363     if (!vfp_access_check(s)) {
364         return true;
365     }
366
367     fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
368     opr_sz = (1 + a->q) * 8;
369     fpst = fpstatus_ptr(FPST_STD);
370     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
371                        vfp_reg_offset(1, a->vn),
372                        vfp_reg_offset(1, a->rm),
373                        opr_sz, opr_sz, a->index, fn_gvec);
374     tcg_temp_free_ptr(fpst);
375     return true;
376 }
377
/*
 * VFMAL/VFMSL (by element): widening half- to single-precision
 * multiply-accumulate against a scalar element. Requires FHM.
 */
static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /*
     * UNDEF accesses to D16-D31 if they don't exist.
     * vn only indexes a D register for the Q-sized form (a->q set).
     */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    /* A Q-reg destination (q == 1) requires an even D register number. */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Operation size in bytes: 8 for a D-reg op, 16 for a Q-reg op. */
    opr_sz = (1 + a->q) * 8;
    /* The helper decodes scalar index and sub/add select from this data. */
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}
409
/*
 * Shapes of the Neon "load/store multiple structures" operations,
 * indexed by the itype field of the instruction: how many registers
 * each structure element spans (nregs), how many elements are
 * interleaved (interleave) and the register spacing between them.
 * See the Arm ARM encoding tables for VLDn/VSTn (multiple structures).
 */
static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
427
428 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
429                                       int stride)
430 {
431     if (rm != 15) {
432         TCGv_i32 base;
433
434         base = load_reg(s, rn);
435         if (rm == 13) {
436             tcg_gen_addi_i32(base, base, stride);
437         } else {
438             TCGv_i32 index;
439             index = load_reg(s, rm);
440             tcg_gen_add_i32(base, base, index);
441             tcg_temp_free_i32(index);
442         }
443         store_reg(s, rn, base);
444     }
445 }
446
static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp endian = s->be_data;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    /* itype values above 10 are not valid encodings */
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    /* Look up the shape of this VLDn/VSTn operation */
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    /* 64-bit elements are only valid for the non-interleaved forms */
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    if (size == 0) {
        endian = MO_LE;
    }
    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        size = 3;
    }
    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    /* Address advances by one element (1 << size bytes) per access */
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                /* The D register touched by this element */
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
                }
                tcg_gen_add_i32(addr, addr, tmp);
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    /* Base register writeback; total transfer is nregs*interleave*8 bytes */
    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}
533
static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        /* size == 3 is only valid for the VLD4 a == 1 special case */
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = 2;
    }
    /* VLD1 with a == 1 requires a non-byte element size */
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    /* VLD3 never takes an alignment specifier */
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        /* Load one element, then replicate it across the destination */
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    /* Base register writeback; total transfer is nregs elements */
    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}
606
static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if ((a->size == 2) && ((a->align & 3) == 3)) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    /*
     * TODO: if we implemented alignment exceptions, we should check
     * addr against the alignment encoded in a->align here.
     */
    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            /* Load: memory -> selected lane of the D register */
            gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    /* Base register writeback; total transfer is nregs elements */
    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}
690
/*
 * Common decode/check logic for the Neon "3 registers same length"
 * group: perform the feature and register-number UNDEF checks, then
 * emit the operation via the supplied gvec generator FN.
 */
static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* A Q-reg operation (q == 1) requires even D register numbers. */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}
719
/*
 * Define a trans function for a 3-same insn whose gvec generator
 * handles all element sizes, including 64-bit.
 */
#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
739
/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

/* O1 supplies the select mask; O2/O3 the two data inputs. */
DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
753
/*
 * As DO_3SAME, but for insns with no 64-bit element form:
 * reject size == 3 before emitting anything.
 */
#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
775
/*
 * Element-wise vector comparisons implemented via tcg_gen_gvec_cmp
 * with the given TCG condition; no 64-bit element form.
 */
#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
790
/*
 * Wrap an out-of-line gvec helper in a GVecGen3Fn-shaped function
 * (the vece argument is unused by the helper).
 */
#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
799
800 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
801 {
802     if (a->size != 0) {
803         return false;
804     }
805     return do_3same(s, a, gen_VMUL_p_3s);
806 }
807
/*
 * VQRDMLAH/VQRDMLSH: require the RDM extension and are only defined
 * for 16- and 32-bit element sizes.
 */
#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
822
/*
 * SHA-1 three-register crypto insns: gated on the SHA1 feature,
 * implemented as out-of-line gvec helpers.
 */
#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
837
/*
 * SHA-256 three-register crypto insns: gated on the SHA2 feature,
 * implemented as out-of-line gvec helpers.
 */
#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
851
/*
 * 3-same insns that operate only on 64-bit elements, implemented
 * with a per-element i64 function via GVecGen3.fni8.
 */
#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

/*
 * As DO_3SAME_64, but the helper needs cpu_env (e.g. to update the
 * QC saturation flag), so wrap it into the fni8 signature first.
 */
#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
875
/*
 * 3-same insns with per-element i32 helpers for 8/16/32-bit element
 * sizes; size == 3 (64-bit) is rejected in the trans function, so
 * the ops[3] slot is never used.
 */
#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
896
/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

/*
 * As DO_3SAME_32, but for helpers that take cpu_env: generate the
 * three per-size trampolines first, then build the ops table from
 * them. size == 3 is rejected, so ops[3] is never used.
 */
#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
932
/* Halving add/sub, rounding halving add, rounding shift: plain helpers */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/* Saturating shifts: these helpers take cpu_env, hence the ENV variant */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
946
/*
 * Common code for 3-same pairwise integer ops (VPMAX, VPMIN, VPADD).
 * fn combines two adjacent 32-bit chunks into one result chunk.
 * Returns false to UNDEF; returns true once the insn has been handled
 * (including the case where vfp_access_check() raised an exception).
 */
static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* size == 3 (64-bit elements) has no pairwise integer encoding */
    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    /* First result word: combine the two halves of Dn */
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2);
    tcg_temp_free_i32(tmp2);

    /* Second result word: combine the two halves of Dm */
    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2);
    tcg_temp_free_i32(tmp2);

    /* Both results computed; now it is safe to write Dd */
    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}
992
/*
 * Emit a trans function for a pairwise op, selecting the per-size
 * scalar helper (func8/func16/func32) by a->size.
 */
#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
1019
/*
 * VQDMULH/VQRDMULH exist only for 16-bit and 32-bit elements, and the
 * helpers need cpu_env.  ops[] therefore has only two entries and is
 * indexed by (vece - 1), i.e. MO_16 -> 0, MO_32 -> 1.
 */
#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1043
/*
 * Wrap a gvec_3_ptr helper so it can be used as a GVecGen3Fn: the
 * wrapper allocates the requested fpstatus pointer, passes it through,
 * and frees it again.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

/*
 * Emit the fp32 and fp16 generators plus the trans function for a
 * 3-same FP op.  MO_16 selects HFUNC and requires the fp16 arithmetic
 * extension; everything else uses SFUNC (fp32).
 */
#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }


DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

/*
 * VMAXNM/VMINNM get hand-written trans functions below because they
 * additionally require ARM_FEATURE_V8; only the wrappers are shared.
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1092
1093 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1094 {
1095     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1096         return false;
1097     }
1098
1099     if (a->size == MO_16) {
1100         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1101             return false;
1102         }
1103         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1104     }
1105     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1106 }
1107
1108 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1109 {
1110     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1111         return false;
1112     }
1113
1114     if (a->size == MO_16) {
1115         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1116             return false;
1117         }
1118         return do_3same(s, a, gen_VMINNM_fp16_3s);
1119     }
1120     return do_3same(s, a, gen_VMINNM_fp32_3s);
1121 }
1122
/*
 * Common code for FP pairwise ops (VPADD, VPMAX, VPMIN, fp flavours).
 * Since Q == 0 is enforced by decode, the operation always covers a
 * single D register, hence the fixed oprsz/maxsz of 8 below.
 */
static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
                             gen_helper_gvec_3_ptr *fn)
{
    /* FP pairwise operations */
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */


    /* fp16 ops use the standard FP16 status flags, fp32 the FP32 ones */
    fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpstatus, 8, 8, 0, fn);
    tcg_temp_free_ptr(fpstatus);

    return true;
}
1155
/*
 * For all the insns using this macro, size == MO_16 selects the fp16
 * helper (FUNC##h) and requires the fp16 arithmetic extension; any
 * other size uses the fp32 helper (FUNC##s).
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size == MO_16) {                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1175
/*
 * Common code for 2-reg-and-shift insns that can be expressed as a
 * single gvec operation.  fn receives (vece, rd_ofs, rm_ofs, shift,
 * oprsz, maxsz).  Returns false to UNDEF, true once handled.
 */
static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q insns must have even register numbers (low bit clear) */
    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

/* Emit a trans function forwarding a 2-reg-shift insn to a gvec fn */
#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }                                                                   \

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)
1220
1221 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1222 {
1223     /* Signed shift out of range results in all-sign-bits */
1224     a->shift = MIN(a->shift, (8 << a->size) - 1);
1225     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1226 }
1227
/*
 * GVecGen2iFn that ignores the source and shift entirely and just
 * zeroes Rd; used for unsigned shifts whose count is >= element width.
 */
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}
1233
1234 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1235 {
1236     /* Shift out of range is architecturally valid and results in zero. */
1237     if (a->shift >= (8 << a->size)) {
1238         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1239     } else {
1240         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1241     }
1242 }
1243
static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q insns must have even register numbers */
    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i64(dup_const(a->size, a->shift));

    /* One pass per D register: 1 for D insns, 2 for Q insns */
    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        neon_load_reg64(tmp, a->vm + pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg64(tmp, a->vd + pass);
        tcg_temp_free_i64(tmp);
    }
    tcg_temp_free_i64(constimm);
    return true;
}
1289
static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q insns must have even register numbers */
    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     * dup_const() replicates the shift count into each 8/16-bit lane
     * of the 32-bit immediate as required by the per-size helper.
     */
    constimm = tcg_const_i32(dup_const(a->size, a->shift));

    /* One pass per 32 bits: 2 for D insns, 4 for Q insns */
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg(a->vd, pass, tmp);
    }
    tcg_temp_free_i32(constimm);
    return true;
}
1332
/*
 * Emit both trans functions for a 2-reg-shift op whose helpers take
 * cpu_env: the _64 variant for size == 3, and the 8/16/32-bit variant
 * dispatched by a->size.
 */
#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }

DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1352
/*
 * Common code for 2-reg-shift narrowing ops with 64-bit source
 * elements: shift each source half with shiftfn, then narrow the
 * 64-bit result to 32 bits with narrowfn (which takes cpu_env, e.g.
 * for saturation flags).
 */
static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Source is a Q register, so Vm must be even */
    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
    constimm = tcg_const_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    neon_load_reg64(rm1, a->vm);
    neon_load_reg64(rm2, a->vm + 1);

    shiftfn(rm1, rm1, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm1);
    neon_store_reg(a->vd, 0, rd);

    shiftfn(rm2, rm2, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm2);
    neon_store_reg(a->vd, 1, rd);

    tcg_temp_free_i64(rm1);
    tcg_temp_free_i64(rm2);
    tcg_temp_free_i64(constimm);

    return true;
}
1407
/*
 * Common code for 2-reg-shift narrowing ops with 16/32-bit source
 * elements: shift all four 32-bit chunks of the Q-reg source, pair
 * them into 64-bit values, then narrow each pair with narrowfn.
 */
static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Source is a Q register, so Vm must be even */
    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
    constimm = tcg_const_i32(imm);

    /* Load all inputs first to avoid potential overwrite */
    rm1 = neon_load_reg(a->vm, 0);
    rm2 = neon_load_reg(a->vm, 1);
    rm3 = neon_load_reg(a->vm + 1, 0);
    rm4 = neon_load_reg(a->vm + 1, 1);
    rtmp = tcg_temp_new_i64();

    shiftfn(rm1, rm1, constimm);
    shiftfn(rm2, rm2, constimm);

    /* First result half: narrow {rm2:rm1} into rm1 and store it */
    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
    tcg_temp_free_i32(rm2);

    narrowfn(rm1, cpu_env, rtmp);
    neon_store_reg(a->vd, 0, rm1);

    shiftfn(rm3, rm3, constimm);
    shiftfn(rm4, rm4, constimm);
    tcg_temp_free_i32(constimm);

    /* Second result half: narrow {rm4:rm3} into rm3 and store it */
    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
    tcg_temp_free_i32(rm4);

    narrowfn(rm3, cpu_env, rtmp);
    tcg_temp_free_i64(rtmp);
    neon_store_reg(a->vd, 1, rm3);
    return true;
}
1477
/* Emit trans functions for narrowing shifts: 64-bit-source variant... */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
/* ... and the 16/32-bit-source variant. */
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }

/*
 * Adapters giving the non-env narrowing helpers the NeonGenNarrowEnvFn
 * prototype (the env argument is simply ignored).
 */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}

static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}

static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}

DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1534
/*
 * Common code for VSHLL: widen each element of the D-reg source with
 * widenfn, left-shift the widened 64-bit value, and mask off bits that
 * leaked across element boundaries.  'u' is true for the unsigned
 * variant (affects whether the mask is needed for size == 2).
 */
static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
                         NeonGenWidenFn *widenfn, bool u)
{
    TCGv_i64 tmp;
    TCGv_i32 rm0, rm1;
    uint64_t widen_mask = 0;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Destination is a Q register, so Vd must be even */
    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is a widen-and-shift operation. The shift is always less
     * than the width of the source type, so after widening the input
     * vector we can simply shift the whole 64-bit widened register,
     * and then clear the potential overflow bits resulting from left
     * bits of the narrow input appearing as right bits of the left
     * neighbour narrow input. Calculate a mask of bits to clear.
     */
    if ((a->shift != 0) && (a->size < 2 || u)) {
        int esize = 8 << a->size;
        widen_mask = MAKE_64BIT_MASK(0, esize);
        widen_mask >>= esize - a->shift;
        /* replicate the per-element mask across the widened lanes */
        widen_mask = dup_const(a->size + 1, widen_mask);
    }

    /* Read both source halves before writing Vd (vd may equal vm) */
    rm0 = neon_load_reg(a->vm, 0);
    rm1 = neon_load_reg(a->vm, 1);
    tmp = tcg_temp_new_i64();

    widenfn(tmp, rm0);
    tcg_temp_free_i32(rm0);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tmp, tmp, a->shift);
        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
    }
    neon_store_reg64(tmp, a->vd);

    widenfn(tmp, rm1);
    tcg_temp_free_i32(rm1);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tmp, tmp, a->shift);
        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
    }
    neon_store_reg64(tmp, a->vd + 1);
    tcg_temp_free_i64(tmp);
    return true;
}
1597
1598 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1599 {
1600     static NeonGenWidenFn * const widenfn[] = {
1601         gen_helper_neon_widen_s8,
1602         gen_helper_neon_widen_s16,
1603         tcg_gen_ext_i32_i64,
1604     };
1605     return do_vshll_2sh(s, a, widenfn[a->size], false);
1606 }
1607
1608 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1609 {
1610     static NeonGenWidenFn * const widenfn[] = {
1611         gen_helper_neon_widen_u8,
1612         gen_helper_neon_widen_u16,
1613         tcg_gen_extu_i32_i64,
1614     };
1615     return do_vshll_2sh(s, a, widenfn[a->size], true);
1616 }
1617
/*
 * Common code for FP 2-reg-and-shift insns (the fixed-point <->
 * float conversions).  The shift amount is passed to the helper via
 * the gvec 'data' parameter.
 */
static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
                      gen_helper_gvec_2_ptr *fn)
{
    /* FP operations in 2-reg-and-shift group */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);
    TCGv_ptr fpst;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* fp16 conversions require the fp16 arithmetic extension */
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q insns must have even register numbers */
    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
    tcg_temp_free_ptr(fpst);
    return true;
}
1656
/* Emit a trans function forwarding an FP 2-reg-shift insn to a helper */
#define DO_FP_2SH(INSN, FUNC)                                           \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_fp_2sh(s, a, FUNC);                                   \
    }

/* fixed-point <-> fp32 conversions */
DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)

/* fixed-point <-> fp16 conversions */
DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1672
/*
 * Expand an 8-bit encoded SIMD immediate (imm, cmode, op) into the
 * 64-bit replicated constant, following the architecture's
 * AdvSIMDExpandImm() pseudocode.
 */
static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
{
    /*
     * Expand the encoded constant.
     * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
     * We choose to not special-case this and will behave as if a
     * valid constant encoding of 0 had been given.
     * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
     */
    switch (cmode) {
    case 0: case 1:
        /* 32-bit value with imm in byte 0 */
        /* no-op */
        break;
    case 2: case 3:
        /* imm in byte 1 of each 32-bit element */
        imm <<= 8;
        break;
    case 4: case 5:
        /* imm in byte 2 of each 32-bit element */
        imm <<= 16;
        break;
    case 6: case 7:
        /* imm in byte 3 of each 32-bit element */
        imm <<= 24;
        break;
    case 8: case 9:
        /* imm in byte 0 of each 16-bit element */
        imm |= imm << 16;
        break;
    case 10: case 11:
        /* imm in byte 1 of each 16-bit element */
        imm = (imm << 8) | (imm << 24);
        break;
    case 12:
        /* (imm << 8) with low byte forced to ones ("shifted ones") */
        imm = (imm << 8) | 0xff;
        break;
    case 13:
        /* (imm << 16) with low 16 bits forced to ones */
        imm = (imm << 16) | 0xffff;
        break;
    case 14:
        if (op) {
            /*
             * This is the only case where the top and bottom 32 bits
             * of the encoded constant differ: each bit of imm is
             * expanded to a full byte of ones or zeroes.
             */
            uint64_t imm64 = 0;
            int n;

            for (n = 0; n < 8; n++) {
                if (imm & (1 << n)) {
                    imm64 |= (0xffULL << (n * 8));
                }
            }
            return imm64;
        }
        /* op == 0: imm replicated into every byte */
        imm |= (imm << 8) | (imm << 16) | (imm << 24);
        break;
    case 15:
        /* float32 immediate: sign, inverted-bit-6 exponent, mantissa */
        imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
            | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
        break;
    }
    if (op) {
        imm = ~imm;
    }
    /* Replicate the 32-bit value into both halves of the result */
    return dup_const(MO_32, imm);
}
1735
/*
 * Common code for the 1-reg-and-modified-immediate insns
 * (VMOV/VORR/VBIC immediate): expand the encoded constant and apply
 * the given gvec 2i expander across the whole 64- or 128-bit vector.
 * Returns false to UNDEF, true if the insn was handled (including the
 * case where the FP access check raised an exception).
 */
static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
                        GVecGen2iFn *fn)
{
    uint64_t imm;
    int reg_ofs, vec_size;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Q-reg operations must use an even D-register pair */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    reg_ofs = neon_reg_offset(a->vd, 0);
    vec_size = a->q ? 16 : 8;   /* bytes */
    imm = asimd_imm_const(a->imm, a->cmode, a->op);

    fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
    return true;
}
1766
/*
 * GVecGen2iFn-compatible expander for VMOV (immediate): ignores the
 * source operand and just broadcasts the 64-bit constant c into the
 * destination.  vece and aofs are unused but required by the
 * GVecGen2iFn signature that do_1reg_imm() expects.
 */
static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
}
1772
1773 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1774 {
1775     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1776     GVecGen2iFn *fn;
1777
1778     if ((a->cmode & 1) && a->cmode < 12) {
1779         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1780         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1781     } else {
1782         /* There is one unallocated cmode/op combination in this space */
1783         if (a->cmode == 15 && a->op == 1) {
1784             return false;
1785         }
1786         fn = gen_VMOV_1r;
1787     }
1788     return do_1reg_imm(s, a, fn);
1789 }
1790
/*
 * Common code for the 3-regs-different-lengths prewidening insns
 * (VADDL/VSUBL/VADDW/VSUBW): the narrow 32-bit inputs are widened to
 * 64 bits with widenfn, then combined with opfn.  If src1_wide, the
 * first source is already a Q register (VADDW/VSUBW) and is loaded
 * directly as 64-bit halves.
 */
static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
                           NeonGenWidenFn *widenfn,
                           NeonGenTwo64OpFn *opfn,
                           bool src1_wide)
{
    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
    TCGv_i64 rn0_64, rn1_64, rm_64;
    TCGv_i32 rm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!widenfn || !opfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* Q-reg operands (vd always, vn when wide) must be even D pairs */
    if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rn0_64 = tcg_temp_new_i64();
    rn1_64 = tcg_temp_new_i64();
    rm_64 = tcg_temp_new_i64();

    /* First pass: low halves of the inputs */
    if (src1_wide) {
        neon_load_reg64(rn0_64, a->vn);
    } else {
        TCGv_i32 tmp = neon_load_reg(a->vn, 0);
        widenfn(rn0_64, tmp);
        tcg_temp_free_i32(tmp);
    }
    rm = neon_load_reg(a->vm, 0);

    widenfn(rm_64, rm);
    tcg_temp_free_i32(rm);
    opfn(rn0_64, rn0_64, rm_64);

    /*
     * Load second pass inputs before storing the first pass result, to
     * avoid incorrect results if a narrow input overlaps with the result.
     */
    if (src1_wide) {
        neon_load_reg64(rn1_64, a->vn + 1);
    } else {
        TCGv_i32 tmp = neon_load_reg(a->vn, 1);
        widenfn(rn1_64, tmp);
        tcg_temp_free_i32(tmp);
    }
    rm = neon_load_reg(a->vm, 1);

    neon_store_reg64(rn0_64, a->vd);

    widenfn(rm_64, rm);
    tcg_temp_free_i32(rm);
    opfn(rn1_64, rn1_64, rm_64);
    neon_store_reg64(rn1_64, a->vd + 1);

    tcg_temp_free_i64(rn0_64);
    tcg_temp_free_i64(rn1_64);
    tcg_temp_free_i64(rm_64);

    return true;
}
1866
/*
 * Generate the trans function for one prewidening insn.  S/EXT select
 * the signed or unsigned widening helpers, OP the 64-bit combine op
 * (add or sub), and SRC1WIDE distinguishes VADDW/VSUBW (first operand
 * already wide) from VADDL/VSUBL.  Array index is a->size; size 3 is
 * NULL and rejected by do_prewiden_3d().
 */
#define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenWidenFn * const widenfn[] = {                     \
            gen_helper_neon_widen_##S##8,                               \
            gen_helper_neon_widen_##S##16,                              \
            tcg_gen_##EXT##_i32_i64,                                    \
            NULL,                                                       \
        };                                                              \
        static NeonGenTwo64OpFn * const addfn[] = {                     \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        return do_prewiden_3d(s, a, widenfn[a->size],                   \
                              addfn[a->size], SRC1WIDE);                \
    }

DO_PREWIDEN(VADDL_S, s, ext, add, false)
DO_PREWIDEN(VADDL_U, u, extu, add, false)
DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
DO_PREWIDEN(VADDW_S, s, ext, add, true)
DO_PREWIDEN(VADDW_U, u, extu, add, true)
DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
1894
/*
 * Common code for the 3-regs-different-lengths narrowing insns
 * (VADDHN/VSUBHN/VRADDHN/VRSUBHN): combine two wide (Q) inputs with
 * opfn and narrow each 64-bit result to 32 bits with narrowfn,
 * storing into a single D register.
 */
static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
                         NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
{
    /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
    TCGv_i64 rn_64, rm_64;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn || !narrowfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* The wide source operands are Q regs: must be even D pairs */
    if ((a->vn | a->vm) & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rn_64 = tcg_temp_new_i64();
    rm_64 = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    neon_load_reg64(rn_64, a->vn);
    neon_load_reg64(rm_64, a->vm);

    opfn(rn_64, rn_64, rm_64);

    narrowfn(rd0, rn_64);

    neon_load_reg64(rn_64, a->vn + 1);
    neon_load_reg64(rm_64, a->vm + 1);

    opfn(rn_64, rn_64, rm_64);

    narrowfn(rd1, rn_64);

    /* Both results computed before any store, in case vd overlaps a source */
    neon_store_reg(a->vd, 0, rd0);
    neon_store_reg(a->vd, 1, rd1);

    tcg_temp_free_i64(rn_64);
    tcg_temp_free_i64(rm_64);

    return true;
}
1952
/*
 * Generate the trans function for one narrowing insn.  OP selects the
 * wide combine (add/sub), NARROWTYPE the narrowing helper family, and
 * EXTOP the size==2 narrowing function (64->32 bits).  Array index is
 * a->size; size 3 is NULL and rejected by do_narrow_3d().
 */
#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwo64OpFn * const addfn[] = {                     \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        static NeonGenNarrowFn * const narrowfn[] = {                   \
            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
            EXTOP,                                                      \
            NULL,                                                       \
        };                                                              \
        return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
    }

/*
 * Rounding narrow-high for 32-bit elements: add the rounding constant
 * (half of the discarded low 32 bits) then take the high half.
 * Note this modifies rn in place before extracting.
 */
static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
{
    tcg_gen_addi_i64(rn, rn, 1u << 31);
    tcg_gen_extrh_i64_i32(rd, rn);
}

DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1981
/*
 * Common code for the 3-regs-different-lengths "long" insns.
 * opfn produces a 64-bit result from two 32-bit inputs; accfn, if
 * non-NULL, folds that result into the existing 128-bit destination.
 */
static bool do_long_3d(DisasContext *s, arg_3diff *a,
                       NeonGenTwoOpWidenFn *opfn,
                       NeonGenTwo64OpFn *accfn)
{
    /*
     * 3-regs different lengths, long operations.
     * These perform an operation on two inputs that returns a double-width
     * result, and then possibly perform an accumulation operation of
     * that result into the double-width destination.
     */
    TCGv_i64 rd0, rd1, tmp;
    TCGv_i32 rn, rm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* The destination is a Q reg: must be an even D pair */
    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd0 = tcg_temp_new_i64();
    rd1 = tcg_temp_new_i64();

    rn = neon_load_reg(a->vn, 0);
    rm = neon_load_reg(a->vm, 0);
    opfn(rd0, rn, rm);
    tcg_temp_free_i32(rn);
    tcg_temp_free_i32(rm);

    rn = neon_load_reg(a->vn, 1);
    rm = neon_load_reg(a->vm, 1);
    opfn(rd1, rn, rm);
    tcg_temp_free_i32(rn);
    tcg_temp_free_i32(rm);

    /* Don't store results until after all loads: they might overlap */
    if (accfn) {
        tmp = tcg_temp_new_i64();
        neon_load_reg64(tmp, a->vd);
        accfn(tmp, tmp, rd0);
        neon_store_reg64(tmp, a->vd);
        neon_load_reg64(tmp, a->vd + 1);
        accfn(tmp, tmp, rd1);
        neon_store_reg64(tmp, a->vd + 1);
        tcg_temp_free_i64(tmp);
    } else {
        neon_store_reg64(rd0, a->vd);
        neon_store_reg64(rd1, a->vd + 1);
    }

    tcg_temp_free_i64(rd0);
    tcg_temp_free_i64(rd1);

    return true;
}
2053
2054 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2055 {
2056     static NeonGenTwoOpWidenFn * const opfn[] = {
2057         gen_helper_neon_abdl_s16,
2058         gen_helper_neon_abdl_s32,
2059         gen_helper_neon_abdl_s64,
2060         NULL,
2061     };
2062
2063     return do_long_3d(s, a, opfn[a->size], NULL);
2064 }
2065
2066 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2067 {
2068     static NeonGenTwoOpWidenFn * const opfn[] = {
2069         gen_helper_neon_abdl_u16,
2070         gen_helper_neon_abdl_u32,
2071         gen_helper_neon_abdl_u64,
2072         NULL,
2073     };
2074
2075     return do_long_3d(s, a, opfn[a->size], NULL);
2076 }
2077
2078 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2079 {
2080     static NeonGenTwoOpWidenFn * const opfn[] = {
2081         gen_helper_neon_abdl_s16,
2082         gen_helper_neon_abdl_s32,
2083         gen_helper_neon_abdl_s64,
2084         NULL,
2085     };
2086     static NeonGenTwo64OpFn * const addfn[] = {
2087         gen_helper_neon_addl_u16,
2088         gen_helper_neon_addl_u32,
2089         tcg_gen_add_i64,
2090         NULL,
2091     };
2092
2093     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2094 }
2095
2096 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2097 {
2098     static NeonGenTwoOpWidenFn * const opfn[] = {
2099         gen_helper_neon_abdl_u16,
2100         gen_helper_neon_abdl_u32,
2101         gen_helper_neon_abdl_u64,
2102         NULL,
2103     };
2104     static NeonGenTwo64OpFn * const addfn[] = {
2105         gen_helper_neon_addl_u16,
2106         gen_helper_neon_addl_u32,
2107         tcg_gen_add_i64,
2108         NULL,
2109     };
2110
2111     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2112 }
2113
2114 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2115 {
2116     TCGv_i32 lo = tcg_temp_new_i32();
2117     TCGv_i32 hi = tcg_temp_new_i32();
2118
2119     tcg_gen_muls2_i32(lo, hi, rn, rm);
2120     tcg_gen_concat_i32_i64(rd, lo, hi);
2121
2122     tcg_temp_free_i32(lo);
2123     tcg_temp_free_i32(hi);
2124 }
2125
2126 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2127 {
2128     TCGv_i32 lo = tcg_temp_new_i32();
2129     TCGv_i32 hi = tcg_temp_new_i32();
2130
2131     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2132     tcg_gen_concat_i32_i64(rd, lo, hi);
2133
2134     tcg_temp_free_i32(lo);
2135     tcg_temp_free_i32(hi);
2136 }
2137
2138 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2139 {
2140     static NeonGenTwoOpWidenFn * const opfn[] = {
2141         gen_helper_neon_mull_s8,
2142         gen_helper_neon_mull_s16,
2143         gen_mull_s32,
2144         NULL,
2145     };
2146
2147     return do_long_3d(s, a, opfn[a->size], NULL);
2148 }
2149
2150 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2151 {
2152     static NeonGenTwoOpWidenFn * const opfn[] = {
2153         gen_helper_neon_mull_u8,
2154         gen_helper_neon_mull_u16,
2155         gen_mull_u32,
2156         NULL,
2157     };
2158
2159     return do_long_3d(s, a, opfn[a->size], NULL);
2160 }
2161
/*
 * Generate the trans function for a widening multiply-accumulate insn
 * (VMLAL/VMLSL): MULL selects the signed/unsigned widening multiply,
 * ACC the 64-bit accumulate op (add or sub).  Array index is a->size;
 * size 3 is NULL and rejected by do_long_3d().
 */
#define DO_VMLAL(INSN,MULL,ACC)                                         \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
            gen_helper_neon_##MULL##8,                                  \
            gen_helper_neon_##MULL##16,                                 \
            gen_##MULL##32,                                             \
            NULL,                                                       \
        };                                                              \
        static NeonGenTwo64OpFn * const accfn[] = {                     \
            gen_helper_neon_##ACC##l_u16,                               \
            gen_helper_neon_##ACC##l_u32,                               \
            tcg_gen_##ACC##_i64,                                        \
            NULL,                                                       \
        };                                                              \
        return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
    }

DO_VMLAL(VMLAL_S,mull_s,add)
DO_VMLAL(VMLAL_U,mull_u,add)
DO_VMLAL(VMLSL_S,mull_s,sub)
DO_VMLAL(VMLSL_U,mull_u,sub)
2184
/*
 * VQDMULL op functions: widening signed multiply, then doubling via a
 * saturating add of the result to itself.
 */
static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_helper_neon_mull_s16(rd, rn, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
}

static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_mull_s32(rd, rn, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
}
2196
2197 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2198 {
2199     static NeonGenTwoOpWidenFn * const opfn[] = {
2200         NULL,
2201         gen_VQDMULL_16,
2202         gen_VQDMULL_32,
2203         NULL,
2204     };
2205
2206     return do_long_3d(s, a, opfn[a->size], NULL);
2207 }
2208
/* VQDMLAL accumulate step: saturating 64-bit add of rn and rm into rd. */
static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}

static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2218
2219 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2220 {
2221     static NeonGenTwoOpWidenFn * const opfn[] = {
2222         NULL,
2223         gen_VQDMULL_16,
2224         gen_VQDMULL_32,
2225         NULL,
2226     };
2227     static NeonGenTwo64OpFn * const accfn[] = {
2228         NULL,
2229         gen_VQDMLAL_acc_16,
2230         gen_VQDMLAL_acc_32,
2231         NULL,
2232     };
2233
2234     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2235 }
2236
/*
 * VQDMLSL accumulate step: negate rm in place, then saturating-add it
 * to rn (i.e. a saturating subtract).
 */
static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_negl_u32(rm, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}

static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    tcg_gen_neg_i64(rm, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2248
2249 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2250 {
2251     static NeonGenTwoOpWidenFn * const opfn[] = {
2252         NULL,
2253         gen_VQDMULL_16,
2254         gen_VQDMULL_32,
2255         NULL,
2256     };
2257     static NeonGenTwo64OpFn * const accfn[] = {
2258         NULL,
2259         gen_VQDMLSL_acc_16,
2260         gen_VQDMLSL_acc_32,
2261         NULL,
2262     };
2263
2264     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2265 }
2266
/*
 * VMULL (polynomial): size 0 is the baseline 8-bit polynomial
 * multiply; size 2 is the 64-bit polynomial multiply, which requires
 * the pmull extension.  The result always fills a Q register, so the
 * gvec expansion below uses a fixed 16-byte operation size.
 */
static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
{
    gen_helper_gvec_3 *fn_gvec;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* The destination is a Q reg: must be an even D pair */
    if (a->vd & 1) {
        return false;
    }

    switch (a->size) {
    case 0:
        fn_gvec = gen_helper_neon_pmull_h;
        break;
    case 2:
        if (!dc_isar_feature(aa32_pmull, s)) {
            return false;
        }
        fn_gvec = gen_helper_gvec_pmull_q;
        break;
    default:
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
                       neon_reg_offset(a->vn, 0),
                       neon_reg_offset(a->vm, 0),
                       16, 16, 0, fn_gvec);
    return true;
}
2309
2310 static void gen_neon_dup_low16(TCGv_i32 var)
2311 {
2312     TCGv_i32 tmp = tcg_temp_new_i32();
2313     tcg_gen_ext16u_i32(var, var);
2314     tcg_gen_shli_i32(tmp, var, 16);
2315     tcg_gen_or_i32(var, var, tmp);
2316     tcg_temp_free_i32(tmp);
2317 }
2318
2319 static void gen_neon_dup_high16(TCGv_i32 var)
2320 {
2321     TCGv_i32 tmp = tcg_temp_new_i32();
2322     tcg_gen_andi_i32(var, var, 0xffff0000);
2323     tcg_gen_shri_i32(tmp, var, 16);
2324     tcg_gen_or_i32(var, var, tmp);
2325     tcg_temp_free_i32(tmp);
2326 }
2327
/*
 * Load the scalar operand of a 2-reg-scalar insn into a new 32-bit
 * temp.  'reg' is the 5-bit M:Vm scalar encoding.  For size 1
 * (16-bit scalars) the register is Vm{2:0}, bit [4] selects the
 * 32-bit word within it, and bit [3] selects the halfword, which is
 * then duplicated into both halves of the result.  Otherwise (32-bit
 * scalars) the register is bits [3:0] and bit [4] selects the word.
 * The caller owns (and must free) the returned temp.
 */
static inline TCGv_i32 neon_get_scalar(int size, int reg)
{
    TCGv_i32 tmp;
    if (size == 1) {
        tmp = neon_load_reg(reg & 7, reg >> 4);
        if (reg & 8) {
            gen_neon_dup_high16(tmp);
        } else {
            gen_neon_dup_low16(tmp);
        }
    } else {
        tmp = neon_load_reg(reg & 15, reg >> 4);
    }
    return tmp;
}
2343
/*
 * Common code for 2-reg-scalar insns handled element-at-a-time.
 * opfn combines each element of Vn with the scalar; accfn, if
 * non-NULL, folds that result into the existing Vd element.
 */
static bool do_2scalar(DisasContext *s, arg_2scalar *a,
                       NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
{
    /*
     * Two registers and a scalar: perform an operation between
     * the input elements and the scalar, and then possibly
     * perform an accumulation operation of that result into the
     * destination.
     */
    TCGv_i32 scalar;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* For Q insns, vd and vn are Q regs and must be even D pairs */
    if (a->q && ((a->vd | a->vn) & 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    scalar = neon_get_scalar(a->size, a->vm);

    /* One pass per 32-bit word of the operation (2 for D, 4 for Q) */
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vn, pass);
        opfn(tmp, tmp, scalar);
        if (accfn) {
            TCGv_i32 rd = neon_load_reg(a->vd, pass);
            accfn(tmp, rd, tmp);
            tcg_temp_free_i32(rd);
        }
        neon_store_reg(a->vd, pass, tmp);
    }
    tcg_temp_free_i32(scalar);
    return true;
}
2394
2395 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2396 {
2397     static NeonGenTwoOpFn * const opfn[] = {
2398         NULL,
2399         gen_helper_neon_mul_u16,
2400         tcg_gen_mul_i32,
2401         NULL,
2402     };
2403
2404     return do_2scalar(s, a, opfn[a->size], NULL);
2405 }
2406
2407 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2408 {
2409     static NeonGenTwoOpFn * const opfn[] = {
2410         NULL,
2411         gen_helper_neon_mul_u16,
2412         tcg_gen_mul_i32,
2413         NULL,
2414     };
2415     static NeonGenTwoOpFn * const accfn[] = {
2416         NULL,
2417         gen_helper_neon_add_u16,
2418         tcg_gen_add_i32,
2419         NULL,
2420     };
2421
2422     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2423 }
2424
2425 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2426 {
2427     static NeonGenTwoOpFn * const opfn[] = {
2428         NULL,
2429         gen_helper_neon_mul_u16,
2430         tcg_gen_mul_i32,
2431         NULL,
2432     };
2433     static NeonGenTwoOpFn * const accfn[] = {
2434         NULL,
2435         gen_helper_neon_sub_u16,
2436         tcg_gen_sub_i32,
2437         NULL,
2438     };
2439
2440     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2441 }
2442
/*
 * Common code for float 2-reg-scalar insns expanded via gvec.
 * fn is an out-of-line helper taking an fpstatus pointer; the scalar
 * element index is passed to it as the gvec 'data' value.
 */
static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
                              gen_helper_gvec_3_ptr *fn)
{
    /* Two registers and a scalar, using gvec */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs;
    int idx;
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!fn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* For Q insns, vd and vn are Q regs and must be even D pairs */
    if (a->q && ((a->vd | a->vn) & 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* a->vm is M:Vm, which encodes both register and index */
    idx = extract32(a->vm, a->size + 2, 2);
    a->vm = extract32(a->vm, 0, a->size + 2);
    rm_ofs = neon_reg_offset(a->vm, 0);

    /* size == 1 means fp16 elements; use the fp16 fpstatus */
    fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
                       vec_size, vec_size, idx, fn);
    tcg_temp_free_ptr(fpstatus);
    return true;
}
2488
/*
 * Float VMUL/VMLA/VMLS (by scalar): indexed gvec helpers, with the
 * fp16 variant gated on the fp16 arithmetic extension.
 */
#define DO_VMUL_F_2sc(NAME, FUNC)                                       \
    static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
    {                                                                   \
        static gen_helper_gvec_3_ptr * const opfn[] = {                 \
            NULL,                                                       \
            gen_helper_##FUNC##_h,                                      \
            gen_helper_##FUNC##_s,                                      \
            NULL,                                                       \
        };                                                              \
        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
            return false;                                               \
        }                                                               \
        return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
    }

DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2507
/*
 * The qdmulh/qrdmulh helpers take a cpu_env argument; WRAP_ENV_FN
 * (defined in translate.c) adapts each one to the two-operand
 * NeonGenTwoOpFn shape that do_2scalar() expects -- presumably by
 * supplying cpu_env itself; see translate.c for the wrapper body.
 */
WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2512
2513 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2514 {
2515     static NeonGenTwoOpFn * const opfn[] = {
2516         NULL,
2517         gen_VQDMULH_16,
2518         gen_VQDMULH_32,
2519         NULL,
2520     };
2521
2522     return do_2scalar(s, a, opfn[a->size], NULL);
2523 }
2524
2525 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2526 {
2527     static NeonGenTwoOpFn * const opfn[] = {
2528         NULL,
2529         gen_VQRDMULH_16,
2530         gen_VQRDMULH_32,
2531         NULL,
2532     };
2533
2534     return do_2scalar(s, a, opfn[a->size], NULL);
2535 }
2536
/*
 * Common code for VQRDMLAH/VQRDMLSH (by scalar); requires the RDM
 * extension.  Unlike do_2scalar, the helper fuses the multiply and
 * the accumulate, taking rd, rn and the scalar in one call.
 */
static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
                            NeonGenThreeOpEnvFn *opfn)
{
    /*
     * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
     * performs a kind of fused op-then-accumulate using a helper
     * function that takes all of rd, rn and the scalar at once.
     */
    TCGv_i32 scalar;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* These insns only exist with the RDM extension */
    if (!dc_isar_feature(aa32_rdm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* For Q insns, vd and vn are Q regs and must be even D pairs */
    if (a->q && ((a->vd | a->vn) & 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    scalar = neon_get_scalar(a->size, a->vm);

    /* One pass per 32-bit word of the operation (2 for D, 4 for Q) */
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 rn = neon_load_reg(a->vn, pass);
        TCGv_i32 rd = neon_load_reg(a->vd, pass);
        opfn(rd, cpu_env, rn, scalar, rd);
        tcg_temp_free_i32(rn);
        neon_store_reg(a->vd, pass, rd);
    }
    tcg_temp_free_i32(scalar);

    return true;
}
2588
2589 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2590 {
2591     static NeonGenThreeOpEnvFn *opfn[] = {
2592         NULL,
2593         gen_helper_neon_qrdmlah_s16,
2594         gen_helper_neon_qrdmlah_s32,
2595         NULL,
2596     };
2597     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2598 }
2599
2600 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2601 {
2602     static NeonGenThreeOpEnvFn *opfn[] = {
2603         NULL,
2604         gen_helper_neon_qrdmlsh_s16,
2605         gen_helper_neon_qrdmlsh_s32,
2606         NULL,
2607     };
2608     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2609 }
2610
/*
 * Common code for long 2-reg-scalar insns: opfn widens each 32-bit
 * element combined with the scalar into a 64-bit value; accfn, if
 * non-NULL, folds that into the existing 128-bit destination.
 */
static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
                            NeonGenTwoOpWidenFn *opfn,
                            NeonGenTwo64OpFn *accfn)
{
    /*
     * Two registers and a scalar, long operations: perform an
     * operation on the input elements and the scalar which produces
     * a double-width result, and then possibly perform an accumulation
     * operation of that result into the destination.
     */
    TCGv_i32 scalar, rn;
    TCGv_i64 rn0_64, rn1_64;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* The destination is a Q reg: must be an even D pair */
    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    scalar = neon_get_scalar(a->size, a->vm);

    /* Load all inputs before writing any outputs, in case of overlap */
    rn = neon_load_reg(a->vn, 0);
    rn0_64 = tcg_temp_new_i64();
    opfn(rn0_64, rn, scalar);
    tcg_temp_free_i32(rn);

    rn = neon_load_reg(a->vn, 1);
    rn1_64 = tcg_temp_new_i64();
    opfn(rn1_64, rn, scalar);
    tcg_temp_free_i32(rn);
    tcg_temp_free_i32(scalar);

    if (accfn) {
        TCGv_i64 t64 = tcg_temp_new_i64();
        neon_load_reg64(t64, a->vd);
        accfn(t64, t64, rn0_64);
        neon_store_reg64(t64, a->vd);
        neon_load_reg64(t64, a->vd + 1);
        accfn(t64, t64, rn1_64);
        neon_store_reg64(t64, a->vd + 1);
        tcg_temp_free_i64(t64);
    } else {
        neon_store_reg64(rn0_64, a->vd);
        neon_store_reg64(rn1_64, a->vd + 1);
    }
    tcg_temp_free_i64(rn0_64);
    tcg_temp_free_i64(rn1_64);
    return true;
}
2678
2679 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2680 {
2681     static NeonGenTwoOpWidenFn * const opfn[] = {
2682         NULL,
2683         gen_helper_neon_mull_s16,
2684         gen_mull_s32,
2685         NULL,
2686     };
2687
2688     return do_2scalar_long(s, a, opfn[a->size], NULL);
2689 }
2690
2691 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2692 {
2693     static NeonGenTwoOpWidenFn * const opfn[] = {
2694         NULL,
2695         gen_helper_neon_mull_u16,
2696         gen_mull_u32,
2697         NULL,
2698     };
2699
2700     return do_2scalar_long(s, a, opfn[a->size], NULL);
2701 }
2702
/*
 * VMLAL/VMLSL (scalar): widening multiply of the input elements by the
 * scalar (MULL selects the signed or unsigned multiply), followed by a
 * 64-bit accumulate (ACC is add or sub) into the destination.  The
 * size-0 and size-3 table entries are NULL, which makes
 * do_2scalar_long() treat the insn as UNDEF.
 */
#define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
    static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
    {                                                                   \
        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
            NULL,                                                       \
            gen_helper_neon_##MULL##16,                                 \
            gen_##MULL##32,                                             \
            NULL,                                                       \
        };                                                              \
        static NeonGenTwo64OpFn * const accfn[] = {                     \
            NULL,                                                       \
            gen_helper_neon_##ACC##l_u32,                               \
            tcg_gen_##ACC##_i64,                                        \
            NULL,                                                       \
        };                                                              \
        return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
    }

DO_VMLAL_2SC(VMLAL_S, mull_s, add) /* signed multiply-accumulate long */
DO_VMLAL_2SC(VMLAL_U, mull_u, add) /* unsigned multiply-accumulate long */
DO_VMLAL_2SC(VMLSL_S, mull_s, sub) /* signed multiply-subtract long */
DO_VMLAL_2SC(VMLSL_U, mull_u, sub) /* unsigned multiply-subtract long */
2725
2726 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2727 {
2728     static NeonGenTwoOpWidenFn * const opfn[] = {
2729         NULL,
2730         gen_VQDMULL_16,
2731         gen_VQDMULL_32,
2732         NULL,
2733     };
2734
2735     return do_2scalar_long(s, a, opfn[a->size], NULL);
2736 }
2737
2738 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2739 {
2740     static NeonGenTwoOpWidenFn * const opfn[] = {
2741         NULL,
2742         gen_VQDMULL_16,
2743         gen_VQDMULL_32,
2744         NULL,
2745     };
2746     static NeonGenTwo64OpFn * const accfn[] = {
2747         NULL,
2748         gen_VQDMLAL_acc_16,
2749         gen_VQDMLAL_acc_32,
2750         NULL,
2751     };
2752
2753     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2754 }
2755
2756 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2757 {
2758     static NeonGenTwoOpWidenFn * const opfn[] = {
2759         NULL,
2760         gen_VQDMULL_16,
2761         gen_VQDMULL_32,
2762         NULL,
2763     };
2764     static NeonGenTwo64OpFn * const accfn[] = {
2765         NULL,
2766         gen_VQDMLSL_acc_16,
2767         gen_VQDMLSL_acc_32,
2768         NULL,
2769     };
2770
2771     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2772 }
2773
static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
{
    /*
     * VEXT: extract a contiguous block of bytes starting at byte
     * position imm from the concatenation <Vm:Vn> into Vd.
     * Implemented via the double-word extract2 op: once for the
     * 64-bit form, twice (with a shared middle word) for 128 bits.
     */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* Q form requires even (pair-aligned) register numbers */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    /* For the 64-bit form only byte positions 0..7 are valid */
    if (a->imm > 7 && !a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    if (!a->q) {
        /* Extract 64 bits from <Vm:Vn> */
        TCGv_i64 left, right, dest;

        left = tcg_temp_new_i64();
        right = tcg_temp_new_i64();
        dest = tcg_temp_new_i64();

        neon_load_reg64(right, a->vn);
        neon_load_reg64(left, a->vm);
        tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
        neon_store_reg64(dest, a->vd);

        tcg_temp_free_i64(left);
        tcg_temp_free_i64(right);
        tcg_temp_free_i64(dest);
    } else {
        /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
        TCGv_i64 left, middle, right, destleft, destright;

        left = tcg_temp_new_i64();
        middle = tcg_temp_new_i64();
        right = tcg_temp_new_i64();
        destleft = tcg_temp_new_i64();
        destright = tcg_temp_new_i64();

        if (a->imm < 8) {
            /* Result straddles <Vn+1:Vn> and <Vm:Vn+1> */
            neon_load_reg64(right, a->vn);
            neon_load_reg64(middle, a->vn + 1);
            tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
            neon_load_reg64(left, a->vm);
            tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
        } else {
            /* imm >= 8: result straddles <Vm:Vn+1> and <Vm+1:Vm> */
            neon_load_reg64(right, a->vn + 1);
            neon_load_reg64(middle, a->vm);
            tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
            neon_load_reg64(left, a->vm + 1);
            tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
        }

        /* All inputs were read above, so overlapping Vd is safe */
        neon_store_reg64(destright, a->vd);
        neon_store_reg64(destleft, a->vd + 1);

        tcg_temp_free_i64(destright);
        tcg_temp_free_i64(destleft);
        tcg_temp_free_i64(right);
        tcg_temp_free_i64(middle);
        tcg_temp_free_i64(left);
    }
    return true;
}
2849
static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
{
    /*
     * VTBL/VTBX: table lookup.  Vm supplies byte indices into a table
     * of n consecutive D registers starting at Vn; the looked-up bytes
     * are written to Vd.  a->op selects VTBX (the old Vd is passed to
     * the helper so out-of-range indices leave that byte unchanged)
     * versus VTBL (zero is passed, so out-of-range indices read as 0).
     */
    int n;
    TCGv_i32 tmp, tmp2, tmp3, tmp4;
    TCGv_ptr ptr1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    n = a->len + 1; /* number of table registers, 1..4 */
    if ((a->vn + n) > 32) {
        /*
         * This is UNPREDICTABLE; we choose to UNDEF to avoid the
         * helper function running off the end of the register file.
         */
        return false;
    }
    n <<= 3; /* table size in bytes, passed to the helper */
    if (a->op) {
        /* VTBX: fall back to the current low word of Vd */
        tmp = neon_load_reg(a->vd, 0);
    } else {
        /* VTBL: out-of-range indices produce zero */
        tmp = tcg_temp_new_i32();
        tcg_gen_movi_i32(tmp, 0);
    }
    tmp2 = neon_load_reg(a->vm, 0);
    ptr1 = vfp_reg_ptr(true, a->vn);
    tmp4 = tcg_const_i32(n);
    /* Low 32 bits of the result */
    gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
    tcg_temp_free_i32(tmp);
    if (a->op) {
        /* VTBX again, this time the high word of Vd */
        tmp = neon_load_reg(a->vd, 1);
    } else {
        tmp = tcg_temp_new_i32();
        tcg_gen_movi_i32(tmp, 0);
    }
    tmp3 = neon_load_reg(a->vm, 1);
    /* High 32 bits of the result */
    gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
    tcg_temp_free_i32(tmp4);
    tcg_temp_free_ptr(ptr1);
    /* Writes happen only after all reads of Vd/Vm, so overlap is safe */
    neon_store_reg(a->vd, 0, tmp2);
    neon_store_reg(a->vd, 1, tmp3);
    tcg_temp_free_i32(tmp);
    return true;
}
2905
static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
{
    /*
     * VDUP (scalar): replicate a single element of Dm across every
     * element of the destination D or Q register.
     */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q form requires an even (pair-aligned) destination register */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* One gvec dup-from-memory covers the whole 8- or 16-byte vector */
    tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
                         neon_element_offset(a->vm, a->index, a->size),
                         a->q ? 16 : 8, a->q ? 16 : 8);
    return true;
}
2931
static bool trans_VREV64(DisasContext *s, arg_2misc *a)
{
    /*
     * VREV64: reverse the order of the elements within each 64-bit
     * doubleword.  Done one doubleword per pass: swap bytes or
     * halfwords within each 32-bit word as the element size requires,
     * then swap the two words themselves.
     */
    int pass, half;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q form requires even (pair-aligned) registers */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    /* size 3 (64-bit elements) is not a valid encoding */
    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
        TCGv_i32 tmp[2];

        for (half = 0; half < 2; half++) {
            tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
            switch (a->size) {
            case 0:
                /* 8-bit elements: reverse the bytes within each word */
                tcg_gen_bswap32_i32(tmp[half], tmp[half]);
                break;
            case 1:
                /* 16-bit elements: swap the two halfwords */
                gen_swap_half(tmp[half], tmp[half]);
                break;
            case 2:
                /* 32-bit elements: the word swap below is sufficient */
                break;
            default:
                g_assert_not_reached();
            }
        }
        /* Swap the two words to complete the 64-bit reversal */
        neon_store_reg(a->vd, pass * 2, tmp[1]);
        neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
    }
    return true;
}
2981
static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
                              NeonGenWidenFn *widenfn,
                              NeonGenTwo64OpFn *opfn,
                              NeonGenTwo64OpFn *accfn)
{
    /*
     * Pairwise long operations: widen both halves of the pair,
     * combine the pairs with the opfn, and then possibly accumulate
     * into the destination with the accfn.
     *
     * widenfn: widen one 32-bit input word to 64 bits
     * opfn:    combine the two widened values (the pairwise op proper)
     * accfn:   if non-NULL, fold the result into the old Vd contents
     * A NULL widenfn means "bad size" and makes the insn UNDEF.
     */
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q form requires even (pair-aligned) registers */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* One pass per D register: one for the D form, two for Q */
    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i32 tmp;
        TCGv_i64 rm0_64, rm1_64, rd_64;

        rm0_64 = tcg_temp_new_i64();
        rm1_64 = tcg_temp_new_i64();
        rd_64 = tcg_temp_new_i64();
        tmp = neon_load_reg(a->vm, pass * 2);
        widenfn(rm0_64, tmp);
        tcg_temp_free_i32(tmp);
        tmp = neon_load_reg(a->vm, pass * 2 + 1);
        widenfn(rm1_64, tmp);
        tcg_temp_free_i32(tmp);
        opfn(rd_64, rm0_64, rm1_64);
        tcg_temp_free_i64(rm0_64);
        tcg_temp_free_i64(rm1_64);

        if (accfn) {
            /* Read the old destination and fold the new result in */
            TCGv_i64 tmp64 = tcg_temp_new_i64();
            neon_load_reg64(tmp64, a->vd + pass);
            accfn(rd_64, tmp64, rd_64);
            tcg_temp_free_i64(tmp64);
        }
        neon_store_reg64(rd_64, a->vd + pass);
        tcg_temp_free_i64(rd_64);
    }
    return true;
}
3044
3045 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3046 {
3047     static NeonGenWidenFn * const widenfn[] = {
3048         gen_helper_neon_widen_s8,
3049         gen_helper_neon_widen_s16,
3050         tcg_gen_ext_i32_i64,
3051         NULL,
3052     };
3053     static NeonGenTwo64OpFn * const opfn[] = {
3054         gen_helper_neon_paddl_u16,
3055         gen_helper_neon_paddl_u32,
3056         tcg_gen_add_i64,
3057         NULL,
3058     };
3059
3060     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3061 }
3062
3063 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3064 {
3065     static NeonGenWidenFn * const widenfn[] = {
3066         gen_helper_neon_widen_u8,
3067         gen_helper_neon_widen_u16,
3068         tcg_gen_extu_i32_i64,
3069         NULL,
3070     };
3071     static NeonGenTwo64OpFn * const opfn[] = {
3072         gen_helper_neon_paddl_u16,
3073         gen_helper_neon_paddl_u32,
3074         tcg_gen_add_i64,
3075         NULL,
3076     };
3077
3078     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3079 }
3080
3081 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3082 {
3083     static NeonGenWidenFn * const widenfn[] = {
3084         gen_helper_neon_widen_s8,
3085         gen_helper_neon_widen_s16,
3086         tcg_gen_ext_i32_i64,
3087         NULL,
3088     };
3089     static NeonGenTwo64OpFn * const opfn[] = {
3090         gen_helper_neon_paddl_u16,
3091         gen_helper_neon_paddl_u32,
3092         tcg_gen_add_i64,
3093         NULL,
3094     };
3095     static NeonGenTwo64OpFn * const accfn[] = {
3096         gen_helper_neon_addl_u16,
3097         gen_helper_neon_addl_u32,
3098         tcg_gen_add_i64,
3099         NULL,
3100     };
3101
3102     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3103                              accfn[a->size]);
3104 }
3105
3106 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3107 {
3108     static NeonGenWidenFn * const widenfn[] = {
3109         gen_helper_neon_widen_u8,
3110         gen_helper_neon_widen_u16,
3111         tcg_gen_extu_i32_i64,
3112         NULL,
3113     };
3114     static NeonGenTwo64OpFn * const opfn[] = {
3115         gen_helper_neon_paddl_u16,
3116         gen_helper_neon_paddl_u32,
3117         tcg_gen_add_i64,
3118         NULL,
3119     };
3120     static NeonGenTwo64OpFn * const accfn[] = {
3121         gen_helper_neon_addl_u16,
3122         gen_helper_neon_addl_u32,
3123         tcg_gen_add_i64,
3124         NULL,
3125     };
3126
3127     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3128                              accfn[a->size]);
3129 }
3130
3131 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3132
static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
                       ZipFn *fn)
{
    /*
     * Common expander for VZIP/VUZP: both are implemented out of
     * line, with the helper reading and writing Vd and Vm in place.
     * fn is NULL for size/q combinations with no valid encoding,
     * which makes the insn UNDEF.
     */
    TCGv_ptr pd, pm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q form requires even (pair-aligned) registers */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!fn) {
        /* Bad size or size/q combination */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    pd = vfp_reg_ptr(true, a->vd);
    pm = vfp_reg_ptr(true, a->vm);
    fn(pd, pm);
    tcg_temp_free_ptr(pd);
    tcg_temp_free_ptr(pm);
    return true;
}
3168
3169 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3170 {
3171     static ZipFn * const fn[2][4] = {
3172         {
3173             gen_helper_neon_unzip8,
3174             gen_helper_neon_unzip16,
3175             NULL,
3176             NULL,
3177         }, {
3178             gen_helper_neon_qunzip8,
3179             gen_helper_neon_qunzip16,
3180             gen_helper_neon_qunzip32,
3181             NULL,
3182         }
3183     };
3184     return do_zip_uzp(s, a, fn[a->q][a->size]);
3185 }
3186
3187 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3188 {
3189     static ZipFn * const fn[2][4] = {
3190         {
3191             gen_helper_neon_zip8,
3192             gen_helper_neon_zip16,
3193             NULL,
3194             NULL,
3195         }, {
3196             gen_helper_neon_qzip8,
3197             gen_helper_neon_qzip16,
3198             gen_helper_neon_qzip32,
3199             NULL,
3200         }
3201     };
3202     return do_zip_uzp(s, a, fn[a->q][a->size]);
3203 }
3204
static bool do_vmovn(DisasContext *s, arg_2misc *a,
                     NeonGenNarrowEnvFn *narrowfn)
{
    /*
     * Narrowing moves: narrow each 64-bit element of Qm to a 32-bit
     * element of Dd using narrowfn.  The narrow helpers take cpu_env
     * so the saturating variants can update the QC flag.  A NULL
     * narrowfn means "bad size" and makes the insn UNDEF.
     */
    TCGv_i64 rm;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* The source is a Q register, so Vm must be even */
    if (a->vm & 1) {
        return false;
    }

    if (!narrowfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    /* Both halves of Vm are read before Vd is written (overlap-safe) */
    neon_load_reg64(rm, a->vm);
    narrowfn(rd0, cpu_env, rm);
    neon_load_reg64(rm, a->vm + 1);
    narrowfn(rd1, cpu_env, rm);
    neon_store_reg(a->vd, 0, rd0);
    neon_store_reg(a->vd, 1, rd1);
    tcg_temp_free_i64(rm);
    return true;
}
3246
/*
 * Narrowing-move instructions: FUNC##8/16/32 name the narrow helper
 * for each destination element size; the size-3 entry is NULL, which
 * makes do_vmovn() treat the insn as UNDEF.
 */
#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

DO_VMOVN(VMOVN, gen_neon_narrow_u)               /* plain truncation */
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)   /* signed to unsigned, saturating */
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s) /* signed saturating */
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u) /* unsigned saturating */
3263
static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
{
    /*
     * VSHLL (maximum shift, 2-reg-misc form): widen each element of
     * Dm to double width and shift it left by the source element
     * width (8 << size), writing the result to Qd.
     */
    TCGv_i32 rm0, rm1;
    TCGv_i64 rd;
    static NeonGenWidenFn * const widenfns[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    NeonGenWidenFn *widenfn = widenfns[a->size];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Destination is a Q register, so Vd must be even */
    if (a->vd & 1) {
        return false;
    }

    /* NULL widenfn means size 3, which is not a valid encoding */
    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd = tcg_temp_new_i64();

    /* Read all of Vm before writing Vd, in case the registers overlap */
    rm0 = neon_load_reg(a->vm, 0);
    rm1 = neon_load_reg(a->vm, 1);

    widenfn(rd, rm0);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd);
    widenfn(rd, rm1);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd + 1);

    tcg_temp_free_i64(rd);
    tcg_temp_free_i32(rm0);
    tcg_temp_free_i32(rm1);
    return true;
}
3315
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    /*
     * VCVT.F16.F32: narrow the four single-precision elements of Qm
     * to half precision, packing pairs into the two words of Dd.
     * Conversions use the Neon "standard FPSCR" float status and the
     * AHP (alternative half-precision) flag.
     */
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Source is a Q register (even Vm); only size 1 is a valid encoding */
    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = neon_load_reg(a->vm, 0);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = neon_load_reg(a->vm, 1);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    /* Pack elements 0 and 1 into the low destination word */
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    tcg_temp_free_i32(tmp);
    tmp = neon_load_reg(a->vm, 2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    /* All of Vm is read before Vd is written, in case Vd == Vm */
    tmp3 = neon_load_reg(a->vm, 3);
    neon_store_reg(a->vd, 0, tmp2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    /* Pack elements 2 and 3 into the high destination word */
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}
3363
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    /*
     * VCVT.F32.F16: widen the four half-precision elements packed in
     * Dm (two per 32-bit word) to single precision in Qd.
     * Conversions use the Neon "standard FPSCR" float status and the
     * AHP (alternative half-precision) flag.
     */
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Destination is a Q register (even Vd); only size 1 is valid */
    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    /* Both source words are read before any write, in case Vd == Vm */
    tmp = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    /* Low halfword of the first source word -> element 0 */
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 0, tmp3);
    /* High halfword of the first source word -> element 1 */
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    neon_store_reg(a->vd, 1, tmp);
    tmp3 = tcg_temp_new_i32();
    /* Low halfword of the second source word -> element 2 */
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 2, tmp3);
    /* High halfword of the second source word -> element 3 */
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    neon_store_reg(a->vd, 3, tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}
3411
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    /*
     * Common expander for 2-reg-misc ops that can be emitted as a
     * single gvec expansion over the whole vector: fn is handed the
     * element size and the register offsets of Vd and Vm.
     */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* size 3 is not a valid encoding for these ops */
    if (a->size == 3) {
        return false;
    }

    /* Q form requires even (pair-aligned) registers */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}
3444
/* Trivial trans functions for 2-reg-misc ops expanded entirely by gvec */
#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)   /* compare == 0 */
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)   /* compare > 0 */
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)   /* compare <= 0 */
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)   /* compare >= 0 */
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)   /* compare < 0 */
3458
3459 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3460 {
3461     if (a->size != 0) {
3462         return false;
3463     }
3464     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3465 }
3466
/*
 * Adapt an out-of-line crypto helper to the GVecGen2Fn signature used
 * by do_2misc_vec.  The 3-operand variant passes Vd as both the
 * destination and the first source, for helpers that read their
 * destination; DATA is a constant forwarded to the helper (used below
 * to select e.g. the encrypt vs decrypt direction).
 */
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3491
/*
 * 2-reg-misc crypto instructions: gated on the relevant ISA feature
 * bit and on the fixed size field required by the encoding; otherwise
 * expanded like any other gvec 2-reg-misc op.
 */
#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }

DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3508
static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
    /*
     * Common expander for 2-reg-misc ops done one 32-bit word at a
     * time: apply fn to each word of Vm and write the result to the
     * corresponding word of Vd.  A NULL fn means "bad size" (UNDEF).
     */
    int pass;

    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (!fn) {
        return false;
    }

    /* Q form requires even (pair-aligned) registers */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Two words for the D form, four for Q */
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
        fn(tmp, tmp);
        neon_store_reg(a->vd, pass, tmp);
    }

    return true;
}
3544
3545 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3546 {
3547     static NeonGenOneOpFn * const fn[] =