target/arm: Implement FP16 for Neon VADD, VSUB, VABD, VMUL
[qemu.git] target/arm/translate-neon.c.inc
/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This file is intended to be included from translate.c; it uses
 * some macros and definitions provided by that file.
 * It might be possible to convert it to a standalone .c file eventually.
 */

static inline int plus1(DisasContext *s, int x)
{
    return x + 1;
}

static inline int rsub_64(DisasContext *s, int x)
{
    return 64 - x;
}

static inline int rsub_32(DisasContext *s, int x)
{
    return 32 - x;
}

static inline int rsub_16(DisasContext *s, int x)
{
    return 16 - x;
}

static inline int rsub_8(DisasContext *s, int x)
{
    return 8 - x;
}
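
/*
 * These rsub_N helpers back !function modifiers in the decode files:
 * Neon right-shift-by-immediate encodings store "N - shift" in the
 * immediate field, so the decoder recovers the real shift count here.
 * An illustrative example: a 32-bit element right shift by 3 is
 * encoded with a field value of 29, and rsub_32 returns 32 - 29 = 3.
 */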

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
 * where 0 is the least significant end of the register.
 */
static inline long
neon_element_offset(int reg, int element, MemOp size)
{
    int element_size = 1 << size;
    int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
    /* Calculate the offset assuming fully little-endian,
     * then XOR to account for the order of the 8-byte units.
     */
    if (element_size < 8) {
        ofs ^= 8 - element_size;
    }
#endif
    return neon_reg_offset(reg, 0) + ofs;
}
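
/*
 * Worked example of the big-endian fixup above: with element_size == 1,
 * byte element 1 has little-endian offset 1, which gets XORed with
 * (8 - 1) = 7 to give host offset 6 inside the 8-byte unit; that is
 * where a big-endian host keeps the byte that a little-endian view
 * calls byte 1.
 */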

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
{
    int opr_sz;
    gen_helper_gvec_3 *fn_gvec;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       opr_sz, opr_sz, 0, fn_gvec);
    return true;
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    gen_helper_gvec_3_ptr *fn_gvec_ptr;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
                   : gen_helper_gvec_fcmlah_idx);
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz,
                       (a->index << 2) | a->rot, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
{
    gen_helper_gvec_3 *fn_gvec;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(FPST_STD);
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->rm),
                       opr_sz, opr_sz, a->index, fn_gvec);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
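
/*
 * Illustrative readings of the table above (indexed by the insn's
 * itype field): itype 7, {1, 1, 1}, is VLD1/VST1 with one register;
 * itype 8, {1, 2, 1}, is VLD2/VST2 with two adjacent registers
 * (elements interleaved two ways at register spacing 1); itype 3,
 * {2, 2, 2}, is the four-register VLD2/VST2 with spacing 2.
 */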

static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}
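
/*
 * Example of the writeback convention above: rm == 15 encodes "no
 * writeback"; rm == 13 encodes post-increment by the transfer size,
 * so "vld1.8 {d0}, [r0]!" advances r0 by the stride of 8; any other
 * rm adds that index register to the base.
 */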

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp endian = s->be_data;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    if (size == 0) {
        endian = MO_LE;
    }
    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        size = 3;
    }
    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
                }
                tcg_gen_add_i32(addr, addr, tmp);
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}
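
/*
 * A sketch of the element-size promotion above: for "vld1.16 {d0, d1},
 * [r0]" on a little-endian host, interleave == 1 and endian == MO_LE,
 * so size is promoted to 3 and the loop issues two 64-bit accesses
 * instead of eight 16-bit ones; lane values are unchanged because
 * consecutive little-endian elements concatenate into a larger
 * little-endian value.
 */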

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = 2;
    }
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if ((a->size == 2) && ((a->align & 3) == 3)) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    /*
     * TODO: if we implemented alignment exceptions, we should check
     * addr against the alignment encoded in a->align here.
     */
    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}
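
/*
 * do_3same() is the shared plumbing for the 3-reg-same group: each
 * trans_* function below only supplies a GVecGen3Fn. As a sketch, the
 * VADD pattern ends up as
 *     tcg_gen_gvec_add(a->size, rd_ofs, rn_ofs, rm_ofs, 8 << a->q, 8 << a->q);
 * operating on the whole D or Q register at the encoded element size.
 */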

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
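
/*
 * Operand order is all that distinguishes these three: writing
 * bitsel(mask, x, y) = (mask & x) | (~mask & y), VBSL computes
 * bitsel(rd, rn, rm), VBIT computes bitsel(rm, rn, rd) and VBIF
 * computes bitsel(rm, rd, rn), matching the macro arguments above.
 */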

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }
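
/*
 * WRAP_OOL_FN adapts an out-of-line helper to the GVecGen3Fn shape
 * that do_3same() expects, dropping the unused vece argument; e.g.
 * the WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b) expansion
 * below is a thin wrapper around tcg_gen_gvec_3_ool() with data == 0.
 */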

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }
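
/*
 * For example, WRAP_ENV_FN(gen_VQSHL_S_tramp8, gen_helper_neon_qshl_s8)
 * (generated by DO_3SAME_32_ENV below) yields a NeonGenTwoOpFn that
 * simply inserts cpu_env as the helper's second argument.
 */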

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2);
    tcg_temp_free_i32(tmp2);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}
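
/*
 * Worked example of the pairwise semantics implemented above: for
 * VPADD.I32 d0, d1, d2 the two fn() calls compute
 * d0[0] = d1[0] + d1[1] and d0[1] = d2[0] + d2[1].
 */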

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
                        bool reads_vd)
{
    /*
     * FP operations handled elementwise 32 bits at a time.
     * If reads_vd is true then the old value of Vd will be
     * loaded before calling the callback function. This is
     * used for multiply-accumulate type operations.
     */
    TCGv_i32 tmp, tmp2;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        tmp = neon_load_reg(a->vn, pass);
        tmp2 = neon_load_reg(a->vm, pass);
        if (reads_vd) {
            TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
            fn(tmp_rd, tmp, tmp2, fpstatus);
            neon_store_reg(a->vd, pass, tmp_rd);
            tcg_temp_free_i32(tmp);
        } else {
            fn(tmp, tmp, tmp2, fpstatus);
            neon_store_reg(a->vd, pass, tmp);
        }
        tcg_temp_free_i32(tmp2);
    }
    tcg_temp_free_ptr(fpstatus);
    return true;
}

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size != 0) {                                             \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
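
/*
 * The four insns above are the ones this patch converts to gvec with
 * fp16 support. A sketch of the dispatch for VADD: with a->size != 0
 * the generated trans_VADD_fp_3s() requires aa32_fp16_arith and
 * expands via gen_helper_gvec_fadd_h using an FPST_STD_F16 status
 * pointer; size == 0 keeps the fp32 path with FPST_STD.
 */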

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP(INSN,FUNC,READS_VD)                                \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            /* TODO fp16 support */                                 \
            return false;                                           \
        }                                                           \
        return do_3same_fp(s, a, FUNC, READS_VD);                   \
    }

DO_3S_FP(VCEQ, gen_helper_neon_ceq_f32, false)
DO_3S_FP(VCGE, gen_helper_neon_cge_f32, false)
DO_3S_FP(VCGT, gen_helper_neon_cgt_f32, false)
DO_3S_FP(VACGE, gen_helper_neon_acge_f32, false)
DO_3S_FP(VACGT, gen_helper_neon_acgt_f32, false)
DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
DO_3S_FP(VMIN, gen_helper_vfp_mins, false)

static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
    gen_helper_vfp_adds(vd, vd, vn, fpstatus);
}

static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
    gen_helper_vfp_subs(vd, vd, vn, fpstatus);
}

DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_helper_vfp_maxnums, false);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_helper_vfp_minnums, false);
}

WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)

static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
                             uint32_t rn_ofs, uint32_t rm_ofs,
                             uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRECPS_fp_3s);
}

WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)

static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
                              uint32_t rn_ofs, uint32_t rm_ofs,
                              uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRSQRTS_fp_3s);
}

static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
}

static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!dc_isar_feature(aa32_simdfmac, s)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
}

static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_negs(vn, vn);
    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
}

static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!dc_isar_feature(aa32_simdfmac, s)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
{
    /* FP operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    fpstatus = fpstatus_ptr(FPST_STD);
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_ptr(fpstatus);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            /* TODO fp16 support */                                 \
            return false;                                           \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC);                        \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}
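
/*
 * Example of the clamp above: VSHR.S8 with the out-of-range shift of
 * 8 becomes an arithmetic shift by 7, which still yields the required
 * all-sign-bits result in each byte.
 */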

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}
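
/*
 * E.g. VSHR.U8 with shift == 8 takes the gen_zero_rd_2sh() path and
 * simply zeroes Vd; in-range shifts use the normal gvec
 * shift-right-by-immediate expansion.
 */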
1394
1395 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1396                              NeonGenTwo64OpEnvFn *fn)
1397 {
1398     /*
1399      * 2-reg-and-shift operations, size == 3 case, where the
1400      * function needs to be passed cpu_env.
1401      */
1402     TCGv_i64 constimm;
1403     int pass;
1404
1405     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1406         return false;
1407     }
1408
1409     /* UNDEF accesses to D16-D31 if they don't exist. */
1410     if (!dc_isar_feature(aa32_simd_r32, s) &&
1411         ((a->vd | a->vm) & 0x10)) {
1412         return false;
1413     }
1414
1415     if ((a->vm | a->vd) & a->q) {
1416         return false;
1417     }
1418
1419     if (!vfp_access_check(s)) {
1420         return true;
1421     }
1422
1423     /*
1424      * To avoid excessive duplication of ops we implement shift
1425      * by immediate using the variable shift operations.
1426      */
1427     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1428
1429     for (pass = 0; pass < a->q + 1; pass++) {
1430         TCGv_i64 tmp = tcg_temp_new_i64();
1431
1432         neon_load_reg64(tmp, a->vm + pass);
1433         fn(tmp, cpu_env, tmp, constimm);
1434         neon_store_reg64(tmp, a->vd + pass);
1435         tcg_temp_free_i64(tmp);
1436     }
1437     tcg_temp_free_i64(constimm);
1438     return true;
1439 }
1440
1441 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1442                              NeonGenTwoOpEnvFn *fn)
1443 {
1444     /*
1445      * 2-reg-and-shift operations, size < 3 case, where the
1446      * helper needs to be passed cpu_env.
1447      */
1448     TCGv_i32 constimm;
1449     int pass;
1450
1451     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1452         return false;
1453     }
1454
1455     /* UNDEF accesses to D16-D31 if they don't exist. */
1456     if (!dc_isar_feature(aa32_simd_r32, s) &&
1457         ((a->vd | a->vm) & 0x10)) {
1458         return false;
1459     }
1460
1461     if ((a->vm | a->vd) & a->q) {
1462         return false;
1463     }
1464
1465     if (!vfp_access_check(s)) {
1466         return true;
1467     }
1468
1469     /*
1470      * To avoid excessive duplication of ops we implement shift
1471      * by immediate using the variable shift operations.
1472      */
1473     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1474
1475     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1476         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
1477         fn(tmp, cpu_env, tmp, constimm);
1478         neon_store_reg(a->vd, pass, tmp);
1479     }
1480     tcg_temp_free_i32(constimm);
1481     return true;
1482 }
1483
1484 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1485     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1486     {                                                                   \
1487         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1488     }                                                                   \
1489     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1490     {                                                                   \
1491         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1492             gen_helper_neon_##FUNC##8,                                  \
1493             gen_helper_neon_##FUNC##16,                                 \
1494             gen_helper_neon_##FUNC##32,                                 \
1495         };                                                              \
1496         assert(a->size < ARRAY_SIZE(fns));                              \
1497         return do_2shift_env_32(s, a, fns[a->size]);                    \
1498     }
1499
1500 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1501 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1502 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1503
1504 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1505                                 NeonGenTwo64OpFn *shiftfn,
1506                                 NeonGenNarrowEnvFn *narrowfn)
1507 {
1508     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1509     TCGv_i64 constimm, rm1, rm2;
1510     TCGv_i32 rd;
1511
1512     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1513         return false;
1514     }
1515
1516     /* UNDEF accesses to D16-D31 if they don't exist. */
1517     if (!dc_isar_feature(aa32_simd_r32, s) &&
1518         ((a->vd | a->vm) & 0x10)) {
1519         return false;
1520     }
1521
1522     if (a->vm & 1) {
1523         return false;
1524     }
1525
1526     if (!vfp_access_check(s)) {
1527         return true;
1528     }
1529
1530     /*
1531      * This is always a right shift, and the shiftfn is always a
1532      * left-shift helper, which thus needs the negated shift count.
1533      */
1534     constimm = tcg_const_i64(-a->shift);
1535     rm1 = tcg_temp_new_i64();
1536     rm2 = tcg_temp_new_i64();
1537
1538     /* Load both inputs first to avoid potential overwrite if rm == rd */
1539     neon_load_reg64(rm1, a->vm);
1540     neon_load_reg64(rm2, a->vm + 1);
1541
1542     shiftfn(rm1, rm1, constimm);
1543     rd = tcg_temp_new_i32();
1544     narrowfn(rd, cpu_env, rm1);
1545     neon_store_reg(a->vd, 0, rd);
1546
1547     shiftfn(rm2, rm2, constimm);
1548     rd = tcg_temp_new_i32();
1549     narrowfn(rd, cpu_env, rm2);
1550     neon_store_reg(a->vd, 1, rd);
1551
1552     tcg_temp_free_i64(rm1);
1553     tcg_temp_free_i64(rm2);
1554     tcg_temp_free_i64(constimm);
1555
1556     return true;
1557 }
1558
1559 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1560                                 NeonGenTwoOpFn *shiftfn,
1561                                 NeonGenNarrowEnvFn *narrowfn)
1562 {
1563     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1564     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1565     TCGv_i64 rtmp;
1566     uint32_t imm;
1567
1568     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1569         return false;
1570     }
1571
1572     /* UNDEF accesses to D16-D31 if they don't exist. */
1573     if (!dc_isar_feature(aa32_simd_r32, s) &&
1574         ((a->vd | a->vm) & 0x10)) {
1575         return false;
1576     }
1577
1578     if (a->vm & 1) {
1579         return false;
1580     }
1581
1582     if (!vfp_access_check(s)) {
1583         return true;
1584     }
1585
1586     /*
1587      * This is always a right shift, and the shiftfn is always a
1588      * left-shift helper, which thus needs the negated shift count
1589      * duplicated into each lane of the immediate value.
1590      */
1591     if (a->size == 1) {
1592         imm = (uint16_t)(-a->shift);
1593         imm |= imm << 16;
1594     } else {
1595         /* size == 2 */
1596         imm = -a->shift;
1597     }
1598     constimm = tcg_const_i32(imm);
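    /*
     * e.g. size == 1, shift == 5 gives constimm 0xfffbfffb: each
     * 16-bit lane of the variable-shift helper sees a count of -5,
     * i.e. a right shift by 5.
     */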
1599
1600     /* Load all inputs first to avoid potential overwrite */
1601     rm1 = neon_load_reg(a->vm, 0);
1602     rm2 = neon_load_reg(a->vm, 1);
1603     rm3 = neon_load_reg(a->vm + 1, 0);
1604     rm4 = neon_load_reg(a->vm + 1, 1);
1605     rtmp = tcg_temp_new_i64();
1606
1607     shiftfn(rm1, rm1, constimm);
1608     shiftfn(rm2, rm2, constimm);
1609
1610     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1611     tcg_temp_free_i32(rm2);
1612
1613     narrowfn(rm1, cpu_env, rtmp);
1614     neon_store_reg(a->vd, 0, rm1);
1615
1616     shiftfn(rm3, rm3, constimm);
1617     shiftfn(rm4, rm4, constimm);
1618     tcg_temp_free_i32(constimm);
1619
1620     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1621     tcg_temp_free_i32(rm4);
1622
1623     narrowfn(rm3, cpu_env, rtmp);
1624     tcg_temp_free_i64(rtmp);
1625     neon_store_reg(a->vd, 1, rm3);
1626     return true;
1627 }
1628
1629 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1630     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1631     {                                                                   \
1632         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1633     }
1634 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1635     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1636     {                                                                   \
1637         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1638     }
1639
1640 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1641 {
1642     tcg_gen_extrl_i64_i32(dest, src);
1643 }
1644
1645 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1646 {
1647     gen_helper_neon_narrow_u16(dest, src);
1648 }
1649
1650 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1651 {
1652     gen_helper_neon_narrow_u8(dest, src);
1653 }
1654
1655 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1656 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1657 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1658
1659 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1660 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1661 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1662
1663 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1664 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1665 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1666
1667 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1668 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1669 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1670 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1671 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1672 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1673
1674 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1675 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1676 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1677
1678 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1679 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1680 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1681
1682 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1683 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1684 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1685
1686 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1687                          NeonGenWidenFn *widenfn, bool u)
1688 {
1689     TCGv_i64 tmp;
1690     TCGv_i32 rm0, rm1;
1691     uint64_t widen_mask = 0;
1692
1693     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1694         return false;
1695     }
1696
1697     /* UNDEF accesses to D16-D31 if they don't exist. */
1698     if (!dc_isar_feature(aa32_simd_r32, s) &&
1699         ((a->vd | a->vm) & 0x10)) {
1700         return false;
1701     }
1702
1703     if (a->vd & 1) {
1704         return false;
1705     }
1706
1707     if (!vfp_access_check(s)) {
1708         return true;
1709     }
1710
1711     /*
1712      * This is a widen-and-shift operation. The shift is always less
1713      * than the width of the source type, so after widening the input
1714      * vector we can simply shift the whole 64-bit widened register,
1715      * and then clear the bits that spill across lane boundaries (the
1716      * top bits of each narrow element would otherwise become the low
1717      * bits of its neighbour). Calculate a mask of bits to clear.
1718      */
1719     if ((a->shift != 0) && (a->size < 2 || u)) {
1720         int esize = 8 << a->size;
1721         widen_mask = MAKE_64BIT_MASK(0, esize);
1722         widen_mask >>= esize - a->shift;
1723         widen_mask = dup_const(a->size + 1, widen_mask);
1724     }
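    /*
     * e.g. a signed widen from 8 to 16 bits with shift == 3 gives
     * widen_mask = dup_const(MO_16, 0xff >> 5) = 0x0007000700070007,
     * clearing the three sign bits that crossed into each
     * neighbouring 16-bit lane after the shift.
     */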
1725
1726     rm0 = neon_load_reg(a->vm, 0);
1727     rm1 = neon_load_reg(a->vm, 1);
1728     tmp = tcg_temp_new_i64();
1729
1730     widenfn(tmp, rm0);
1731     tcg_temp_free_i32(rm0);
1732     if (a->shift != 0) {
1733         tcg_gen_shli_i64(tmp, tmp, a->shift);
1734         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1735     }
1736     neon_store_reg64(tmp, a->vd);
1737
1738     widenfn(tmp, rm1);
1739     tcg_temp_free_i32(rm1);
1740     if (a->shift != 0) {
1741         tcg_gen_shli_i64(tmp, tmp, a->shift);
1742         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1743     }
1744     neon_store_reg64(tmp, a->vd + 1);
1745     tcg_temp_free_i64(tmp);
1746     return true;
1747 }
1748
1749 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1750 {
1751     static NeonGenWidenFn * const widenfn[] = {
1752         gen_helper_neon_widen_s8,
1753         gen_helper_neon_widen_s16,
1754         tcg_gen_ext_i32_i64,
1755     };
1756     return do_vshll_2sh(s, a, widenfn[a->size], false);
1757 }
1758
1759 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1760 {
1761     static NeonGenWidenFn * const widenfn[] = {
1762         gen_helper_neon_widen_u8,
1763         gen_helper_neon_widen_u16,
1764         tcg_gen_extu_i32_i64,
1765     };
1766     return do_vshll_2sh(s, a, widenfn[a->size], true);
1767 }
1768
1769 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1770                       NeonGenTwoSingleOpFn *fn)
1771 {
1772     /* FP operations in 2-reg-and-shift group */
1773     TCGv_i32 tmp, shiftv;
1774     TCGv_ptr fpstatus;
1775     int pass;
1776
1777     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1778         return false;
1779     }
1780
1781     /* UNDEF accesses to D16-D31 if they don't exist. */
1782     if (!dc_isar_feature(aa32_simd_r32, s) &&
1783         ((a->vd | a->vm) & 0x10)) {
1784         return false;
1785     }
1786
1787     if ((a->vm | a->vd) & a->q) {
1788         return false;
1789     }
1790
1791     if (!vfp_access_check(s)) {
1792         return true;
1793     }
1794
1795     fpstatus = fpstatus_ptr(FPST_STD);
1796     shiftv = tcg_const_i32(a->shift);
1797     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1798         tmp = neon_load_reg(a->vm, pass);
1799         fn(tmp, tmp, shiftv, fpstatus);
1800         neon_store_reg(a->vd, pass, tmp);
1801     }
1802     tcg_temp_free_ptr(fpstatus);
1803     tcg_temp_free_i32(shiftv);
1804     return true;
1805 }
1806
1807 #define DO_FP_2SH(INSN, FUNC)                                           \
1808     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1809     {                                                                   \
1810         return do_fp_2sh(s, a, FUNC);                                   \
1811     }
1812
1813 DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
1814 DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
1815 DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
1816 DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
1817
1818 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1819 {
1820     /*
1821      * Expand the encoded constant.
1822      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1823      * We choose not to special-case this and behave as if a valid
1824      * constant encoding of 0 had been given.
1825      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1826      */
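    /*
     * For example: cmode = 3, op = 0, imm = 0x4a expands to
     * 0x00004a0000004a00; cmode = 14, op = 1, imm = 0x61 sets bytes
     * 0, 5 and 6, giving 0x00ffff00000000ff.
     */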
1827     switch (cmode) {
1828     case 0: case 1:
1829         /* no-op */
1830         break;
1831     case 2: case 3:
1832         imm <<= 8;
1833         break;
1834     case 4: case 5:
1835         imm <<= 16;
1836         break;
1837     case 6: case 7:
1838         imm <<= 24;
1839         break;
1840     case 8: case 9:
1841         imm |= imm << 16;
1842         break;
1843     case 10: case 11:
1844         imm = (imm << 8) | (imm << 24);
1845         break;
1846     case 12:
1847         imm = (imm << 8) | 0xff;
1848         break;
1849     case 13:
1850         imm = (imm << 16) | 0xffff;
1851         break;
1852     case 14:
1853         if (op) {
1854             /*
1855              * This is the only case where the top and bottom 32 bits
1856              * of the encoded constant differ.
1857              */
1858             uint64_t imm64 = 0;
1859             int n;
1860
1861             for (n = 0; n < 8; n++) {
1862                 if (imm & (1 << n)) {
1863                     imm64 |= (0xffULL << (n * 8));
1864                 }
1865             }
1866             return imm64;
1867         }
1868         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1869         break;
1870     case 15:
1871         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1872             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1873         break;
1874     }
1875     if (op) {
1876         imm = ~imm;
1877     }
1878     return dup_const(MO_32, imm);
1879 }
1880
1881 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1882                         GVecGen2iFn *fn)
1883 {
1884     uint64_t imm;
1885     int reg_ofs, vec_size;
1886
1887     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1888         return false;
1889     }
1890
1891     /* UNDEF accesses to D16-D31 if they don't exist. */
1892     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1893         return false;
1894     }
1895
1896     if (a->vd & a->q) {
1897         return false;
1898     }
1899
1900     if (!vfp_access_check(s)) {
1901         return true;
1902     }
1903
1904     reg_ofs = neon_reg_offset(a->vd, 0);
1905     vec_size = a->q ? 16 : 8;
1906     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1907
1908     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1909     return true;
1910 }
1911
1912 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1913                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1914 {
1915     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1916 }
1917
1918 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1919 {
1920     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1921     GVecGen2iFn *fn;
1922
1923     if ((a->cmode & 1) && a->cmode < 12) {
1924         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1925         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1926     } else {
1927         /* There is one unallocated cmode/op combination in this space */
1928         if (a->cmode == 15 && a->op == 1) {
1929             return false;
1930         }
1931         fn = gen_VMOV_1r;
1932     }
1933     return do_1reg_imm(s, a, fn);
1934 }
1935
1936 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1937                            NeonGenWidenFn *widenfn,
1938                            NeonGenTwo64OpFn *opfn,
1939                            bool src1_wide)
1940 {
1941     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1942     TCGv_i64 rn0_64, rn1_64, rm_64;
1943     TCGv_i32 rm;
1944
1945     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1946         return false;
1947     }
1948
1949     /* UNDEF accesses to D16-D31 if they don't exist. */
1950     if (!dc_isar_feature(aa32_simd_r32, s) &&
1951         ((a->vd | a->vn | a->vm) & 0x10)) {
1952         return false;
1953     }
1954
1955     if (!widenfn || !opfn) {
1956         /* size == 3 case, which is an entirely different insn group */
1957         return false;
1958     }
1959
1960     if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
1961         return false;
1962     }
1963
1964     if (!vfp_access_check(s)) {
1965         return true;
1966     }
1967
1968     rn0_64 = tcg_temp_new_i64();
1969     rn1_64 = tcg_temp_new_i64();
1970     rm_64 = tcg_temp_new_i64();
1971
1972     if (src1_wide) {
1973         neon_load_reg64(rn0_64, a->vn);
1974     } else {
1975         TCGv_i32 tmp = neon_load_reg(a->vn, 0);
1976         widenfn(rn0_64, tmp);
1977         tcg_temp_free_i32(tmp);
1978     }
1979     rm = neon_load_reg(a->vm, 0);
1980
1981     widenfn(rm_64, rm);
1982     tcg_temp_free_i32(rm);
1983     opfn(rn0_64, rn0_64, rm_64);
1984
1985     /*
1986      * Load second pass inputs before storing the first pass result, to
1987      * avoid incorrect results if a narrow input overlaps with the result.
1988      */
1989     if (src1_wide) {
1990         neon_load_reg64(rn1_64, a->vn + 1);
1991     } else {
1992         TCGv_i32 tmp = neon_load_reg(a->vn, 1);
1993         widenfn(rn1_64, tmp);
1994         tcg_temp_free_i32(tmp);
1995     }
1996     rm = neon_load_reg(a->vm, 1);
1997
1998     neon_store_reg64(rn0_64, a->vd);
1999
2000     widenfn(rm_64, rm);
2001     tcg_temp_free_i32(rm);
2002     opfn(rn1_64, rn1_64, rm_64);
2003     neon_store_reg64(rn1_64, a->vd + 1);
2004
2005     tcg_temp_free_i64(rn0_64);
2006     tcg_temp_free_i64(rn1_64);
2007     tcg_temp_free_i64(rm_64);
2008
2009     return true;
2010 }
2011
2012 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
2013     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2014     {                                                                   \
2015         static NeonGenWidenFn * const widenfn[] = {                     \
2016             gen_helper_neon_widen_##S##8,                               \
2017             gen_helper_neon_widen_##S##16,                              \
2018             tcg_gen_##EXT##_i32_i64,                                    \
2019             NULL,                                                       \
2020         };                                                              \
2021         static NeonGenTwo64OpFn * const addfn[] = {                     \
2022             gen_helper_neon_##OP##l_u16,                                \
2023             gen_helper_neon_##OP##l_u32,                                \
2024             tcg_gen_##OP##_i64,                                         \
2025             NULL,                                                       \
2026         };                                                              \
2027         return do_prewiden_3d(s, a, widenfn[a->size],                   \
2028                               addfn[a->size], SRC1WIDE);                \
2029     }
2030
2031 DO_PREWIDEN(VADDL_S, s, ext, add, false)
2032 DO_PREWIDEN(VADDL_U, u, extu, add, false)
2033 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
2034 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
2035 DO_PREWIDEN(VADDW_S, s, ext, add, true)
2036 DO_PREWIDEN(VADDW_U, u, extu, add, true)
2037 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
2038 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
2039
2040 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
2041                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
2042 {
2043     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
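    /* e.g. VADDHN: rd[i] = high half of (rn[i] + rm[i]); VRADDHN rounds first */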
2044     TCGv_i64 rn_64, rm_64;
2045     TCGv_i32 rd0, rd1;
2046
2047     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2048         return false;
2049     }
2050
2051     /* UNDEF accesses to D16-D31 if they don't exist. */
2052     if (!dc_isar_feature(aa32_simd_r32, s) &&
2053         ((a->vd | a->vn | a->vm) & 0x10)) {
2054         return false;
2055     }
2056
2057     if (!opfn || !narrowfn) {
2058         /* size == 3 case, which is an entirely different insn group */
2059         return false;
2060     }
2061
2062     if ((a->vn | a->vm) & 1) {
2063         return false;
2064     }
2065
2066     if (!vfp_access_check(s)) {
2067         return true;
2068     }
2069
2070     rn_64 = tcg_temp_new_i64();
2071     rm_64 = tcg_temp_new_i64();
2072     rd0 = tcg_temp_new_i32();
2073     rd1 = tcg_temp_new_i32();
2074
2075     neon_load_reg64(rn_64, a->vn);
2076     neon_load_reg64(rm_64, a->vm);
2077
2078     opfn(rn_64, rn_64, rm_64);
2079
2080     narrowfn(rd0, rn_64);
2081
2082     neon_load_reg64(rn_64, a->vn + 1);
2083     neon_load_reg64(rm_64, a->vm + 1);
2084
2085     opfn(rn_64, rn_64, rm_64);
2086
2087     narrowfn(rd1, rn_64);
2088
2089     neon_store_reg(a->vd, 0, rd0);
2090     neon_store_reg(a->vd, 1, rd1);
2091
2092     tcg_temp_free_i64(rn_64);
2093     tcg_temp_free_i64(rm_64);
2094
2095     return true;
2096 }
2097
2098 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
2099     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2100     {                                                                   \
2101         static NeonGenTwo64OpFn * const addfn[] = {                     \
2102             gen_helper_neon_##OP##l_u16,                                \
2103             gen_helper_neon_##OP##l_u32,                                \
2104             tcg_gen_##OP##_i64,                                         \
2105             NULL,                                                       \
2106         };                                                              \
2107         static NeonGenNarrowFn * const narrowfn[] = {                   \
2108             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
2109             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
2110             EXTOP,                                                      \
2111             NULL,                                                       \
2112         };                                                              \
2113         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2114     }
2115
2116 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2117 {
2118     tcg_gen_addi_i64(rn, rn, 1u << 31); /* round before taking the high half */
2119     tcg_gen_extrh_i64_i32(rd, rn);
2120 }
2121
2122 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2123 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2124 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2125 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2126
2127 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2128                        NeonGenTwoOpWidenFn *opfn,
2129                        NeonGenTwo64OpFn *accfn)
2130 {
2131     /*
2132      * 3-regs different lengths, long operations.
2133      * These perform an operation on two inputs that returns a double-width
2134      * result, and then possibly perform an accumulation operation of
2135      * that result into the double-width destination.
2136      */
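    /*
     * e.g. VMULL.S16 multiplies four 16-bit elements into four 32-bit
     * results; VMLAL additionally adds each product into the existing
     * double-width destination element via accfn.
     */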
2137     TCGv_i64 rd0, rd1, tmp;
2138     TCGv_i32 rn, rm;
2139
2140     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2141         return false;
2142     }
2143
2144     /* UNDEF accesses to D16-D31 if they don't exist. */
2145     if (!dc_isar_feature(aa32_simd_r32, s) &&
2146         ((a->vd | a->vn | a->vm) & 0x10)) {
2147         return false;
2148     }
2149
2150     if (!opfn) {
2151         /* size == 3 case, which is an entirely different insn group */
2152         return false;
2153     }
2154
2155     if (a->vd & 1) {
2156         return false;
2157     }
2158
2159     if (!vfp_access_check(s)) {
2160         return true;
2161     }
2162
2163     rd0 = tcg_temp_new_i64();
2164     rd1 = tcg_temp_new_i64();
2165
2166     rn = neon_load_reg(a->vn, 0);
2167     rm = neon_load_reg(a->vm, 0);
2168     opfn(rd0, rn, rm);
2169     tcg_temp_free_i32(rn);
2170     tcg_temp_free_i32(rm);
2171
2172     rn = neon_load_reg(a->vn, 1);
2173     rm = neon_load_reg(a->vm, 1);
2174     opfn(rd1, rn, rm);
2175     tcg_temp_free_i32(rn);
2176     tcg_temp_free_i32(rm);
2177
2178     /* Don't store results until after all loads: they might overlap */
2179     if (accfn) {
2180         tmp = tcg_temp_new_i64();
2181         neon_load_reg64(tmp, a->vd);
2182         accfn(tmp, tmp, rd0);
2183         neon_store_reg64(tmp, a->vd);
2184         neon_load_reg64(tmp, a->vd + 1);
2185         accfn(tmp, tmp, rd1);
2186         neon_store_reg64(tmp, a->vd + 1);
2187         tcg_temp_free_i64(tmp);
2188     } else {
2189         neon_store_reg64(rd0, a->vd);
2190         neon_store_reg64(rd1, a->vd + 1);
2191     }
2192
2193     tcg_temp_free_i64(rd0);
2194     tcg_temp_free_i64(rd1);
2195
2196     return true;
2197 }
2198
2199 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2200 {
2201     static NeonGenTwoOpWidenFn * const opfn[] = {
2202         gen_helper_neon_abdl_s16,
2203         gen_helper_neon_abdl_s32,
2204         gen_helper_neon_abdl_s64,
2205         NULL,
2206     };
2207
2208     return do_long_3d(s, a, opfn[a->size], NULL);
2209 }
2210
2211 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2212 {
2213     static NeonGenTwoOpWidenFn * const opfn[] = {
2214         gen_helper_neon_abdl_u16,
2215         gen_helper_neon_abdl_u32,
2216         gen_helper_neon_abdl_u64,
2217         NULL,
2218     };
2219
2220     return do_long_3d(s, a, opfn[a->size], NULL);
2221 }
2222
2223 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2224 {
2225     static NeonGenTwoOpWidenFn * const opfn[] = {
2226         gen_helper_neon_abdl_s16,
2227         gen_helper_neon_abdl_s32,
2228         gen_helper_neon_abdl_s64,
2229         NULL,
2230     };
2231     static NeonGenTwo64OpFn * const addfn[] = {
2232         gen_helper_neon_addl_u16,
2233         gen_helper_neon_addl_u32,
2234         tcg_gen_add_i64,
2235         NULL,
2236     };
2237
2238     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2239 }
2240
2241 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2242 {
2243     static NeonGenTwoOpWidenFn * const opfn[] = {
2244         gen_helper_neon_abdl_u16,
2245         gen_helper_neon_abdl_u32,
2246         gen_helper_neon_abdl_u64,
2247         NULL,
2248     };
2249     static NeonGenTwo64OpFn * const addfn[] = {
2250         gen_helper_neon_addl_u16,
2251         gen_helper_neon_addl_u32,
2252         tcg_gen_add_i64,
2253         NULL,
2254     };
2255
2256     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2257 }
2258
2259 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2260 {
2261     TCGv_i32 lo = tcg_temp_new_i32();
2262     TCGv_i32 hi = tcg_temp_new_i32();
2263
2264     tcg_gen_muls2_i32(lo, hi, rn, rm);
2265     tcg_gen_concat_i32_i64(rd, lo, hi);
2266
2267     tcg_temp_free_i32(lo);
2268     tcg_temp_free_i32(hi);
2269 }
2270
2271 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2272 {
2273     TCGv_i32 lo = tcg_temp_new_i32();
2274     TCGv_i32 hi = tcg_temp_new_i32();
2275
2276     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2277     tcg_gen_concat_i32_i64(rd, lo, hi);
2278
2279     tcg_temp_free_i32(lo);
2280     tcg_temp_free_i32(hi);
2281 }
2282
2283 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2284 {
2285     static NeonGenTwoOpWidenFn * const opfn[] = {
2286         gen_helper_neon_mull_s8,
2287         gen_helper_neon_mull_s16,
2288         gen_mull_s32,
2289         NULL,
2290     };
2291
2292     return do_long_3d(s, a, opfn[a->size], NULL);
2293 }
2294
2295 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2296 {
2297     static NeonGenTwoOpWidenFn * const opfn[] = {
2298         gen_helper_neon_mull_u8,
2299         gen_helper_neon_mull_u16,
2300         gen_mull_u32,
2301         NULL,
2302     };
2303
2304     return do_long_3d(s, a, opfn[a->size], NULL);
2305 }
2306
2307 #define DO_VMLAL(INSN, MULL, ACC)                                       \
2308     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2309     {                                                                   \
2310         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2311             gen_helper_neon_##MULL##8,                                  \
2312             gen_helper_neon_##MULL##16,                                 \
2313             gen_##MULL##32,                                             \
2314             NULL,                                                       \
2315         };                                                              \
2316         static NeonGenTwo64OpFn * const accfn[] = {                     \
2317             gen_helper_neon_##ACC##l_u16,                               \
2318             gen_helper_neon_##ACC##l_u32,                               \
2319             tcg_gen_##ACC##_i64,                                        \
2320             NULL,                                                       \
2321         };                                                              \
2322         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2323     }
2324
2325 DO_VMLAL(VMLAL_S, mull_s, add)
2326 DO_VMLAL(VMLAL_U, mull_u, add)
2327 DO_VMLAL(VMLSL_S, mull_s, sub)
2328 DO_VMLAL(VMLSL_U, mull_u, sub)
2329
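/*
 * VQDMULL: widening signed multiply, then doubling with saturation.
 * The doubling is implemented as a saturating add of the double-width
 * product to itself.
 */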
2330 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2331 {
2332     gen_helper_neon_mull_s16(rd, rn, rm);
2333     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2334 }
2335
2336 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2337 {
2338     gen_mull_s32(rd, rn, rm);
2339     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2340 }
2341
2342 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2343 {
2344     static NeonGenTwoOpWidenFn * const opfn[] = {
2345         NULL,
2346         gen_VQDMULL_16,
2347         gen_VQDMULL_32,
2348         NULL,
2349     };
2350
2351     return do_long_3d(s, a, opfn[a->size], NULL);
2352 }
2353
2354 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2355 {
2356     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2357 }
2358
2359 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2360 {
2361     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2362 }
2363
2364 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2365 {
2366     static NeonGenTwoOpWidenFn * const opfn[] = {
2367         NULL,
2368         gen_VQDMULL_16,
2369         gen_VQDMULL_32,
2370         NULL,
2371     };
2372     static NeonGenTwo64OpFn * const accfn[] = {
2373         NULL,
2374         gen_VQDMLAL_acc_16,
2375         gen_VQDMLAL_acc_32,
2376         NULL,
2377     };
2378
2379     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2380 }
2381
2382 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2383 {
2384     gen_helper_neon_negl_u32(rm, rm); /* saturating subtract: negate, then saturating add */
2385     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2386 }
2387
2388 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2389 {
2390     tcg_gen_neg_i64(rm, rm);
2391     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2392 }
2393
2394 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2395 {
2396     static NeonGenTwoOpWidenFn * const opfn[] = {
2397         NULL,
2398         gen_VQDMULL_16,
2399         gen_VQDMULL_32,
2400         NULL,
2401     };
2402     static NeonGenTwo64OpFn * const accfn[] = {
2403         NULL,
2404         gen_VQDMLSL_acc_16,
2405         gen_VQDMLSL_acc_32,
2406         NULL,
2407     };
2408
2409     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2410 }
2411
2412 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2413 {
2414     gen_helper_gvec_3 *fn_gvec;
2415
2416     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2417         return false;
2418     }
2419
2420     /* UNDEF accesses to D16-D31 if they don't exist. */
2421     if (!dc_isar_feature(aa32_simd_r32, s) &&
2422         ((a->vd | a->vn | a->vm) & 0x10)) {
2423         return false;
2424     }
2425
2426     if (a->vd & 1) {
2427         return false;
2428     }
2429
2430     switch (a->size) {
2431     case 0:
2432         fn_gvec = gen_helper_neon_pmull_h;
2433         break;
2434     case 2:
2435         if (!dc_isar_feature(aa32_pmull, s)) {
2436             return false;
2437         }
2438         fn_gvec = gen_helper_gvec_pmull_q;
2439         break;
2440     default:
2441         return false;
2442     }
2443
2444     if (!vfp_access_check(s)) {
2445         return true;
2446     }
2447
2448     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2449                        neon_reg_offset(a->vn, 0),
2450                        neon_reg_offset(a->vm, 0),
2451                        16, 16, 0, fn_gvec);
2452     return true;
2453 }
2454
2455 static void gen_neon_dup_low16(TCGv_i32 var)
2456 {
2457     TCGv_i32 tmp = tcg_temp_new_i32();
2458     tcg_gen_ext16u_i32(var, var);
2459     tcg_gen_shli_i32(tmp, var, 16);
2460     tcg_gen_or_i32(var, var, tmp);
2461     tcg_temp_free_i32(tmp);
2462 }
2463
2464 static void gen_neon_dup_high16(TCGv_i32 var)
2465 {
2466     TCGv_i32 tmp = tcg_temp_new_i32();
2467     tcg_gen_andi_i32(var, var, 0xffff0000);
2468     tcg_gen_shri_i32(tmp, var, 16);
2469     tcg_gen_or_i32(var, var, tmp);
2470     tcg_temp_free_i32(tmp);
2471 }
2472
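/*
 * The scalar is encoded in the M:Vm field passed in as 'reg': bit 4
 * selects the 32-bit word of the D register. For a 32-bit scalar,
 * bits [3:0] name the D register; for a 16-bit scalar, bits [2:0]
 * name the D register and bit 3 selects the high or low half, which
 * is then duplicated so the packed 16-bit helpers see the scalar in
 * both halves.
 */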
2473 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2474 {
2475     TCGv_i32 tmp;
2476     if (size == 1) {
2477         tmp = neon_load_reg(reg & 7, reg >> 4);
2478         if (reg & 8) {
2479             gen_neon_dup_high16(tmp);
2480         } else {
2481             gen_neon_dup_low16(tmp);
2482         }
2483     } else {
2484         tmp = neon_load_reg(reg & 15, reg >> 4);
2485     }
2486     return tmp;
2487 }
2488
2489 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2490                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2491 {
2492     /*
2493      * Two registers and a scalar: perform an operation between
2494      * the input elements and the scalar, and then possibly
2495      * perform an accumulation operation of that result into the
2496      * destination.
2497      */
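    /*
     * e.g. VMLA.I32 q0, q1, d4[0]: each element of q1 is multiplied
     * by the d4[0] scalar (opfn) and the product added into the
     * corresponding element of q0 (accfn).
     */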
2498     TCGv_i32 scalar;
2499     int pass;
2500
2501     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2502         return false;
2503     }
2504
2505     /* UNDEF accesses to D16-D31 if they don't exist. */
2506     if (!dc_isar_feature(aa32_simd_r32, s) &&
2507         ((a->vd | a->vn | a->vm) & 0x10)) {
2508         return false;
2509     }
2510
2511     if (!opfn) {
2512         /* Bad size (including size == 3, which is a different insn group) */
2513         return false;
2514     }
2515
2516     if (a->q && ((a->vd | a->vn) & 1)) {
2517         return false;
2518     }
2519
2520     if (!vfp_access_check(s)) {
2521         return true;
2522     }
2523
2524     scalar = neon_get_scalar(a->size, a->vm);
2525
2526     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2527         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2528         opfn(tmp, tmp, scalar);
2529         if (accfn) {
2530             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2531             accfn(tmp, rd, tmp);
2532             tcg_temp_free_i32(rd);
2533         }
2534         neon_store_reg(a->vd, pass, tmp);
2535     }
2536     tcg_temp_free_i32(scalar);
2537     return true;
2538 }
2539
2540 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2541 {
2542     static NeonGenTwoOpFn * const opfn[] = {
2543         NULL,
2544         gen_helper_neon_mul_u16,
2545         tcg_gen_mul_i32,
2546         NULL,
2547     };
2548
2549     return do_2scalar(s, a, opfn[a->size], NULL);
2550 }
2551
2552 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2553 {
2554     static NeonGenTwoOpFn * const opfn[] = {
2555         NULL,
2556         gen_helper_neon_mul_u16,
2557         tcg_gen_mul_i32,
2558         NULL,
2559     };
2560     static NeonGenTwoOpFn * const accfn[] = {
2561         NULL,
2562         gen_helper_neon_add_u16,
2563         tcg_gen_add_i32,
2564         NULL,
2565     };
2566
2567     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2568 }
2569
2570 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2571 {
2572     static NeonGenTwoOpFn * const opfn[] = {
2573         NULL,
2574         gen_helper_neon_mul_u16,
2575         tcg_gen_mul_i32,
2576         NULL,
2577     };
2578     static NeonGenTwoOpFn * const accfn[] = {
2579         NULL,
2580         gen_helper_neon_sub_u16,
2581         tcg_gen_sub_i32,
2582         NULL,
2583     };
2584
2585     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2586 }
2587
2588 /*
2589  * Rather than have a float-specific version of do_2scalar just for
2590  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2591  * a NeonGenTwoOpFn.
2592  */
2593 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2594     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2595     {                                                           \
2596         TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);             \
2597         FUNC(rd, rn, rm, fpstatus);                             \
2598         tcg_temp_free_ptr(fpstatus);                            \
2599     }
2600
2601 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2602 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2603 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
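/*
 * Each wrapper expands to a plain two-operand op: it allocates the
 * "standard FPSCR value" float_status (FPST_STD) that Neon arithmetic
 * uses, calls the VFP helper, and frees the status pointer again.
 */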
2604
2605 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2606 {
2607     static NeonGenTwoOpFn * const opfn[] = {
2608         NULL,
2609         NULL, /* TODO: fp16 support */
2610         gen_VMUL_F_mul,
2611         NULL,
2612     };
2613
2614     return do_2scalar(s, a, opfn[a->size], NULL);
2615 }
2616
2617 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2618 {
2619     static NeonGenTwoOpFn * const opfn[] = {
2620         NULL,
2621         NULL, /* TODO: fp16 support */
2622         gen_VMUL_F_mul,
2623         NULL,
2624     };
2625     static NeonGenTwoOpFn * const accfn[] = {
2626         NULL,
2627         NULL, /* TODO: fp16 support */
2628         gen_VMUL_F_add,
2629         NULL,
2630     };
2631
2632     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2633 }
2634
2635 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2636 {
2637     static NeonGenTwoOpFn * const opfn[] = {
2638         NULL,
2639         NULL, /* TODO: fp16 support */
2640         gen_VMUL_F_mul,
2641         NULL,
2642     };
2643     static NeonGenTwoOpFn * const accfn[] = {
2644         NULL,
2645         NULL, /* TODO: fp16 support */
2646         gen_VMUL_F_sub,
2647         NULL,
2648     };
2649
2650     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2651 }
2652
2653 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2654 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2655 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2656 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2657
2658 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2659 {
2660     static NeonGenTwoOpFn * const opfn[] = {
2661         NULL,
2662         gen_VQDMULH_16,
2663         gen_VQDMULH_32,
2664         NULL,
2665     };
2666
2667     return do_2scalar(s, a, opfn[a->size], NULL);
2668 }
2669
2670 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2671 {
2672     static NeonGenTwoOpFn * const opfn[] = {
2673         NULL,
2674         gen_VQRDMULH_16,
2675         gen_VQRDMULH_32,
2676         NULL,
2677     };
2678
2679     return do_2scalar(s, a, opfn[a->size], NULL);
2680 }
2681
2682 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2683                             NeonGenThreeOpEnvFn *opfn)
2684 {
2685     /*
2686      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2687      * performs a kind of fused op-then-accumulate using a helper
2688      * function that takes all of rd, rn and the scalar at once.
2689      */
2690     TCGv_i32 scalar;
2691     int pass;
2692
2693     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2694         return false;
2695     }
2696
2697     if (!dc_isar_feature(aa32_rdm, s)) {
2698         return false;
2699     }
2700
2701     /* UNDEF accesses to D16-D31 if they don't exist. */
2702     if (!dc_isar_feature(aa32_simd_r32, s) &&
2703         ((a->vd | a->vn | a->vm) & 0x10)) {
2704         return false;
2705     }
2706
2707     if (!opfn) {
2708         /* Bad size (including size == 3, which is a different insn group) */
2709         return false;
2710     }
2711
2712     if (a->q && ((a->vd | a->vn) & 1)) {
2713         return false;
2714     }
2715
2716     if (!vfp_access_check(s)) {
2717         return true;
2718     }
2719
2720     scalar = neon_get_scalar(a->size, a->vm);
2721
2722     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2723         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2724         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2725         opfn(rd, cpu_env, rn, scalar, rd);
2726         tcg_temp_free_i32(rn);
2727         neon_store_reg(a->vd, pass, rd);
2728     }
2729     tcg_temp_free_i32(scalar);
2730
2731     return true;
2732 }
2733
2734 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2735 {
2736     static NeonGenThreeOpEnvFn *opfn[] = {
2737         NULL,
2738         gen_helper_neon_qrdmlah_s16,
2739         gen_helper_neon_qrdmlah_s32,
2740         NULL,
2741     };
2742     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2743 }
2744
2745 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2746 {
2747     static NeonGenThreeOpEnvFn *opfn[] = {
2748         NULL,
2749         gen_helper_neon_qrdmlsh_s16,
2750         gen_helper_neon_qrdmlsh_s32,
2751         NULL,
2752     };
2753     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2754 }
2755
2756 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2757                             NeonGenTwoOpWidenFn *opfn,
2758                             NeonGenTwo64OpFn *accfn)
2759 {
2760     /*
2761      * Two registers and a scalar, long operations: perform an
2762      * operation on the input elements and the scalar which produces
2763      * a double-width result, and then possibly perform an accumulation
2764      * operation of that result into the destination.
2765      */
2766     TCGv_i32 scalar, rn;
2767     TCGv_i64 rn0_64, rn1_64;
2768
2769     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2770         return false;
2771     }
2772
2773     /* UNDEF accesses to D16-D31 if they don't exist. */
2774     if (!dc_isar_feature(aa32_simd_r32, s) &&
2775         ((a->vd | a->vn | a->vm) & 0x10)) {
2776         return false;
2777     }
2778
2779     if (!opfn) {
2780         /* Bad size (including size == 3, which is a different insn group) */
2781         return false;
2782     }
2783
2784     if (a->vd & 1) {
2785         return false;
2786     }
2787
2788     if (!vfp_access_check(s)) {
2789         return true;
2790     }
2791
2792     scalar = neon_get_scalar(a->size, a->vm);
2793
2794     /* Load all inputs before writing any outputs, in case of overlap */
2795     rn = neon_load_reg(a->vn, 0);
2796     rn0_64 = tcg_temp_new_i64();
2797     opfn(rn0_64, rn, scalar);
2798     tcg_temp_free_i32(rn);
2799
2800     rn = neon_load_reg(a->vn, 1);
2801     rn1_64 = tcg_temp_new_i64();
2802     opfn(rn1_64, rn, scalar);
2803     tcg_temp_free_i32(rn);
2804     tcg_temp_free_i32(scalar);
2805
2806     if (accfn) {
2807         TCGv_i64 t64 = tcg_temp_new_i64();
2808         neon_load_reg64(t64, a->vd);
2809         accfn(t64, t64, rn0_64);
2810         neon_store_reg64(t64, a->vd);
2811         neon_load_reg64(t64, a->vd + 1);
2812         accfn(t64, t64, rn1_64);
2813         neon_store_reg64(t64, a->vd + 1);
2814         tcg_temp_free_i64(t64);
2815     } else {
2816         neon_store_reg64(rn0_64, a->vd);
2817         neon_store_reg64(rn1_64, a->vd + 1);
2818     }
2819     tcg_temp_free_i64(rn0_64);
2820     tcg_temp_free_i64(rn1_64);
2821     return true;
2822 }
2823
2824 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2825 {
2826     static NeonGenTwoOpWidenFn * const opfn[] = {
2827         NULL,
2828         gen_helper_neon_mull_s16,
2829         gen_mull_s32,
2830         NULL,
2831     };
2832
2833     return do_2scalar_long(s, a, opfn[a->size], NULL);
2834 }
2835
2836 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2837 {
2838     static NeonGenTwoOpWidenFn * const opfn[] = {
2839         NULL,
2840         gen_helper_neon_mull_u16,
2841         gen_mull_u32,
2842         NULL,
2843     };
2844
2845     return do_2scalar_long(s, a, opfn[a->size], NULL);
2846 }
2847
2848 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2849     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2850     {                                                                   \
2851         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2852             NULL,                                                       \
2853             gen_helper_neon_##MULL##16,                                 \
2854             gen_##MULL##32,                                             \
2855             NULL,                                                       \
2856         };                                                              \
2857         static NeonGenTwo64OpFn * const accfn[] = {                     \
2858             NULL,                                                       \
2859             gen_helper_neon_##ACC##l_u32,                               \
2860             tcg_gen_##ACC##_i64,                                        \
2861             NULL,                                                       \
2862         };                                                              \
2863         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2864     }
2865
2866 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2867 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2868 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2869 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2870
2871 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2872 {
2873     static NeonGenTwoOpWidenFn * const opfn[] = {
2874         NULL,
2875         gen_VQDMULL_16,
2876         gen_VQDMULL_32,
2877         NULL,
2878     };
2879
2880     return do_2scalar_long(s, a, opfn[a->size], NULL);
2881 }
2882
2883 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2884 {
2885     static NeonGenTwoOpWidenFn * const opfn[] = {
2886         NULL,
2887         gen_VQDMULL_16,
2888         gen_VQDMULL_32,
2889         NULL,
2890     };
2891     static NeonGenTwo64OpFn * const accfn[] = {
2892         NULL,
2893         gen_VQDMLAL_acc_16,
2894         gen_VQDMLAL_acc_32,
2895         NULL,
2896     };
2897
2898     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2899 }
2900
2901 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2902 {
2903     static NeonGenTwoOpWidenFn * const opfn[] = {
2904         NULL,
2905         gen_VQDMULL_16,
2906         gen_VQDMULL_32,
2907         NULL,
2908     };
2909     static NeonGenTwo64OpFn * const accfn[] = {
2910         NULL,
2911         gen_VQDMLSL_acc_16,
2912         gen_VQDMLSL_acc_32,
2913         NULL,
2914     };
2915
2916     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2917 }
2918
2919 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2920 {
2921     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2922         return false;
2923     }
2924
2925     /* UNDEF accesses to D16-D31 if they don't exist. */
2926     if (!dc_isar_feature(aa32_simd_r32, s) &&
2927         ((a->vd | a->vn | a->vm) & 0x10)) {
2928         return false;
2929     }
2930
2931     if ((a->vn | a->vm | a->vd) & a->q) {
2932         return false;
2933     }
2934
2935     if (a->imm > 7 && !a->q) {
2936         return false;
2937     }
2938
2939     if (!vfp_access_check(s)) {
2940         return true;
2941     }
2942
2943     if (!a->q) {
2944         /* Extract 64 bits from <Vm:Vn> */
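        /*
         * tcg_gen_extract2_i64(d, lo, hi, ofs) computes (hi:lo) >> ofs,
         * so with ofs = imm * 8 the result is bytes imm..imm+7 of the
         * <Vm:Vn> pair.
         */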
2945         TCGv_i64 left, right, dest;
2946
2947         left = tcg_temp_new_i64();
2948         right = tcg_temp_new_i64();
2949         dest = tcg_temp_new_i64();
2950
2951         neon_load_reg64(right, a->vn);
2952         neon_load_reg64(left, a->vm);
2953         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2954         neon_store_reg64(dest, a->vd);
2955
2956         tcg_temp_free_i64(left);
2957         tcg_temp_free_i64(right);
2958         tcg_temp_free_i64(dest);
2959     } else {
2960         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2961         TCGv_i64 left, middle, right, destleft, destright;
2962
2963         left = tcg_temp_new_i64();
2964         middle = tcg_temp_new_i64();
2965         right = tcg_temp_new_i64();
2966         destleft = tcg_temp_new_i64();
2967         destright = tcg_temp_new_i64();
2968
2969         if (a->imm < 8) {
2970             neon_load_reg64(right, a->vn);
2971             neon_load_reg64(middle, a->vn + 1);
2972             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2973             neon_load_reg64(left, a->vm);
2974             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2975         } else {
2976             neon_load_reg64(right, a->vn + 1);
2977             neon_load_reg64(middle, a->vm);
2978             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2979             neon_load_reg64(left, a->vm + 1);
2980             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2981         }
2982
2983         neon_store_reg64(destright, a->vd);
2984         neon_store_reg64(destleft, a->vd + 1);
2985
2986         tcg_temp_free_i64(destright);
2987         tcg_temp_free_i64(destleft);
2988         tcg_temp_free_i64(right);
2989         tcg_temp_free_i64(middle);
2990         tcg_temp_free_i64(left);
2991     }
2992     return true;
2993 }
2994
2995 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2996 {
2997     int n;
2998     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2999     TCGv_ptr ptr1;
3000
3001     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3002         return false;
3003     }
3004
3005     /* UNDEF accesses to D16-D31 if they don't exist. */
3006     if (!dc_isar_feature(aa32_simd_r32, s) &&
3007         ((a->vd | a->vn | a->vm) & 0x10)) {
3008         return false;
3009     }
3010
3011     if (!vfp_access_check(s)) {
3012         return true;
3013     }
3014
3015     n = a->len + 1;
3016     if ((a->vn + n) > 32) {
3017         /*
3018          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
3019          * helper function running off the end of the register file.
3020          */
3021         return false;
3022     }
3023     n <<= 3; /* table length in bytes */
3024     if (a->op) {
3025         tmp = neon_load_reg(a->vd, 0); /* VTBX: keep old Vd bytes on miss */
3026     } else {
3027         tmp = tcg_temp_new_i32();
3028         tcg_gen_movi_i32(tmp, 0); /* VTBL: zero on miss */
3029     }
3030     tmp2 = neon_load_reg(a->vm, 0);
3031     ptr1 = vfp_reg_ptr(true, a->vn);
3032     tmp4 = tcg_const_i32(n);
3033     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
3034     tcg_temp_free_i32(tmp);
3035     if (a->op) {
3036         tmp = neon_load_reg(a->vd, 1);
3037     } else {
3038         tmp = tcg_temp_new_i32();
3039         tcg_gen_movi_i32(tmp, 0);
3040     }
3041     tmp3 = neon_load_reg(a->vm, 1);
3042     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
3043     tcg_temp_free_i32(tmp4);
3044     tcg_temp_free_ptr(ptr1);
3045     neon_store_reg(a->vd, 0, tmp2);
3046     neon_store_reg(a->vd, 1, tmp3);
3047     tcg_temp_free_i32(tmp);
3048     return true;
3049 }
3050
3051 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
3052 {
3053     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3054         return false;
3055     }
3056
3057     /* UNDEF accesses to D16-D31 if they don't exist. */
3058     if (!dc_isar_feature(aa32_simd_r32, s) &&
3059         ((a->vd | a->vm) & 0x10)) {
3060         return false;
3061     }
3062
3063     if (a->vd & a->q) {
3064         return false;
3065     }
3066
3067     if (!vfp_access_check(s)) {
3068         return true;
3069     }
3070
3071     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
3072                          neon_element_offset(a->vm, a->index, a->size),
3073                          a->q ? 16 : 8, a->q ? 16 : 8);
3074     return true;
3075 }
3076
3077 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3078 {
3079     int pass, half;
3080
3081     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3082         return false;
3083     }
3084
3085     /* UNDEF accesses to D16-D31 if they don't exist. */
3086     if (!dc_isar_feature(aa32_simd_r32, s) &&
3087         ((a->vd | a->vm) & 0x10)) {
3088         return false;
3089     }
3090
3091     if ((a->vd | a->vm) & a->q) {
3092         return false;
3093     }
3094
3095     if (a->size == 3) {
3096         return false;
3097     }
3098
3099     if (!vfp_access_check(s)) {
3100         return true;
3101     }
3102
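    /*
     * VREV64 reverses the elements within each 64-bit doubleword. We
     * do it per 32-bit half: reverse within each half (bswap for
     * bytes, halfword swap for 16-bit elements, nothing for 32-bit)
     * and then store the two halves in swapped order.
     */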
3103     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3104         TCGv_i32 tmp[2];
3105
3106         for (half = 0; half < 2; half++) {
3107             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
3108             switch (a->size) {
3109             case 0:
3110                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3111                 break;
3112             case 1:
3113                 gen_swap_half(tmp[half], tmp[half]);
3114                 break;
3115             case 2:
3116                 break;
3117             default:
3118                 g_assert_not_reached();
3119             }
3120         }
3121         neon_store_reg(a->vd, pass * 2, tmp[1]);
3122         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
3123     }
3124     return true;
3125 }
3126
3127 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3128                               NeonGenWidenFn *widenfn,
3129                               NeonGenTwo64OpFn *opfn,
3130                               NeonGenTwo64OpFn *accfn)
3131 {
3132     /*
3133      * Pairwise long operations: widen both halves of the pair,
3134      * combine the pairs with the opfn, and then possibly accumulate
3135      * into the destination with the accfn.
3136      */
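    /*
     * e.g. VPADDL.S8 d0, d1: the eight bytes of d1 are sign-extended
     * and adjacent pairs summed, giving four 16-bit results in d0;
     * VPADAL additionally accumulates them into the destination.
     */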
3137     int pass;
3138
3139     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3140         return false;
3141     }
3142
3143     /* UNDEF accesses to D16-D31 if they don't exist. */
3144     if (!dc_isar_feature(aa32_simd_r32, s) &&
3145         ((a->vd | a->vm) & 0x10)) {
3146         return false;
3147     }
3148
3149     if ((a->vd | a->vm) & a->q) {
3150         return false;
3151     }
3152
3153     if (!widenfn) {
3154         return false;
3155     }
3156
3157     if (!vfp_access_check(s)) {
3158         return true;
3159     }
3160
3161     for (pass = 0; pass < a->q + 1; pass++) {
3162         TCGv_i32 tmp;
3163         TCGv_i64 rm0_64, rm1_64, rd_64;
3164
3165         rm0_64 = tcg_temp_new_i64();
3166         rm1_64 = tcg_temp_new_i64();
3167         rd_64 = tcg_temp_new_i64();
3168         tmp = neon_load_reg(a->vm, pass * 2);
3169         widenfn(rm0_64, tmp);
3170         tcg_temp_free_i32(tmp);
3171         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3172         widenfn(rm1_64, tmp);
3173         tcg_temp_free_i32(tmp);
3174         opfn(rd_64, rm0_64, rm1_64);
3175         tcg_temp_free_i64(rm0_64);
3176         tcg_temp_free_i64(rm1_64);
3177
3178         if (accfn) {
3179             TCGv_i64 tmp64 = tcg_temp_new_i64();
3180             neon_load_reg64(tmp64, a->vd + pass);
3181             accfn(rd_64, tmp64, rd_64);
3182             tcg_temp_free_i64(tmp64);
3183         }
3184         neon_store_reg64(rd_64, a->vd + pass);
3185         tcg_temp_free_i64(rd_64);
3186     }
3187     return true;
3188 }
3189
3190 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3191 {
3192     static NeonGenWidenFn * const widenfn[] = {
3193         gen_helper_neon_widen_s8,
3194         gen_helper_neon_widen_s16,
3195         tcg_gen_ext_i32_i64,
3196         NULL,
3197     };
3198     static NeonGenTwo64OpFn * const opfn[] = {
3199         gen_helper_neon_paddl_u16,
3200         gen_helper_neon_paddl_u32,
3201         tcg_gen_add_i64,
3202         NULL,
3203     };
3204
3205     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3206 }
3207
3208 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3209 {
3210     static NeonGenWidenFn * const widenfn[] = {
3211         gen_helper_neon_widen_u8,
3212         gen_helper_neon_widen_u16,
3213         tcg_gen_extu_i32_i64,
3214         NULL,
3215     };
3216     static NeonGenTwo64OpFn * const opfn[] = {
3217         gen_helper_neon_paddl_u16,
3218         gen_helper_neon_paddl_u32,
3219         tcg_gen_add_i64,
3220         NULL,
3221     };
3222
3223     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3224 }
3225
3226 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3227 {
3228     static NeonGenWidenFn * const widenfn[] = {
3229         gen_helper_neon_widen_s8,
3230         gen_helper_neon_widen_s16,
3231         tcg_gen_ext_i32_i64,
3232         NULL,
3233     };
3234     static NeonGenTwo64OpFn * const opfn[] = {
3235         gen_helper_neon_paddl_u16,
3236         gen_helper_neon_paddl_u32,
3237         tcg_gen_add_i64,
3238         NULL,
3239     };
3240     static NeonGenTwo64OpFn * const accfn[] = {
3241         gen_helper_neon_addl_u16,
3242         gen_helper_neon_addl_u32,
3243         tcg_gen_add_i64,
3244         NULL,
3245     };
3246
3247     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3248                              accfn[a->size]);
3249 }
3250
3251 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3252 {
3253     static NeonGenWidenFn * const widenfn[] = {
3254         gen_helper_neon_widen_u8,
3255         gen_helper_neon_widen_u16,
3256         tcg_gen_extu_i32_i64,
3257         NULL,
3258     };
3259     static NeonGenTwo64OpFn * const opfn[] = {
3260         gen_helper_neon_paddl_u16,
3261         gen_helper_neon_paddl_u32,
3262         tcg_gen_add_i64,
3263         NULL,
3264     };
3265     static NeonGenTwo64OpFn * const accfn[] = {
3266         gen_helper_neon_addl_u16,
3267         gen_helper_neon_addl_u32,
3268         tcg_gen_add_i64,
3269         NULL,
3270     };
3271
3272     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3273                              accfn[a->size]);
3274 }
3275
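/*
 * The (un)zip helpers permute the elements of both operands in place,
 * writing results back to Vd and Vm, so they are passed pointers to
 * the two registers rather than values.
 */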
3276 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3277
3278 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3279                        ZipFn *fn)
3280 {
3281     TCGv_ptr pd, pm;
3282
3283     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3284         return false;
3285     }
3286
3287     /* UNDEF accesses to D16-D31 if they don't exist. */
3288     if (!dc_isar_feature(aa32_simd_r32, s) &&
3289         ((a->vd | a->vm) & 0x10)) {
3290         return false;
3291     }
3292
3293     if ((a->vd | a->vm) & a->q) {
3294         return false;
3295     }
3296
3297     if (!fn) {
3298         /* Bad size or size/q combination */
3299         return false;
3300     }
3301
3302     if (!vfp_access_check(s)) {
3303         return true;
3304     }
3305
3306     pd = vfp_reg_ptr(true, a->vd);
3307     pm = vfp_reg_ptr(true, a->vm);
3308     fn(pd, pm);
3309     tcg_temp_free_ptr(pd);
3310     tcg_temp_free_ptr(pm);
3311     return true;
3312 }
3313
3314 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3315 {
3316     static ZipFn * const fn[2][4] = {
3317         {
3318             gen_helper_neon_unzip8,
3319             gen_helper_neon_unzip16,
3320             NULL,
3321             NULL,
3322         }, {
3323             gen_helper_neon_qunzip8,
3324             gen_helper_neon_qunzip16,
3325             gen_helper_neon_qunzip32,
3326             NULL,
3327         }
3328     };
3329     return do_zip_uzp(s, a, fn[a->q][a->size]);
3330 }
3331
3332 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3333 {
3334     static ZipFn * const fn[2][4] = {
3335         {
3336             gen_helper_neon_zip8,
3337             gen_helper_neon_zip16,
3338             NULL,
3339             NULL,
3340         }, {
3341             gen_helper_neon_qzip8,
3342             gen_helper_neon_qzip16,
3343             gen_helper_neon_qzip32,
3344             NULL,
3345         }
3346     };
3347     return do_zip_uzp(s, a, fn[a->q][a->size]);
3348 }
3349
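/* Narrow the Q register Vm into the D register Vd, 64 bits at a time. */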
3350 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3351                      NeonGenNarrowEnvFn *narrowfn)
3352 {
3353     TCGv_i64 rm;
3354     TCGv_i32 rd0, rd1;
3355
3356     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3357         return false;
3358     }
3359
3360     /* UNDEF accesses to D16-D31 if they don't exist. */
3361     if (!dc_isar_feature(aa32_simd_r32, s) &&
3362         ((a->vd | a->vm) & 0x10)) {
3363         return false;
3364     }
3365
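    /* The narrowing source is a Q register, so Vm must be even. */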
3366     if (a->vm & 1) {
3367         return false;
3368     }
3369
3370     if (!narrowfn) {
3371         return false;
3372     }
3373
3374     if (!vfp_access_check(s)) {
3375         return true;
3376     }
3377
3378     rm = tcg_temp_new_i64();
3379     rd0 = tcg_temp_new_i32();
3380     rd1 = tcg_temp_new_i32();
3381
3382     neon_load_reg64(rm, a->vm);
3383     narrowfn(rd0, cpu_env, rm);
3384     neon_load_reg64(rm, a->vm + 1);
3385     narrowfn(rd1, cpu_env, rm);
3386     neon_store_reg(a->vd, 0, rd0);
3387     neon_store_reg(a->vd, 1, rd1);
3388     tcg_temp_free_i64(rm);
3389     return true;
3390 }
3391
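/*
 * The DO_VMOVN expansions narrow each element to half its width; the
 * saturating variants use the env argument to update the QC flag.
 */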
3392 #define DO_VMOVN(INSN, FUNC)                                    \
3393     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3394     {                                                           \
3395         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3396             FUNC##8,                                            \
3397             FUNC##16,                                           \
3398             FUNC##32,                                           \
3399             NULL,                                               \
3400         };                                                      \
3401         return do_vmovn(s, a, narrowfn[a->size]);               \
3402     }
3403
3404 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3405 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3406 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3407 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3408
3409 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3410 {
3411     TCGv_i32 rm0, rm1;
3412     TCGv_i64 rd;
3413     static NeonGenWidenFn * const widenfns[] = {
3414         gen_helper_neon_widen_u8,
3415         gen_helper_neon_widen_u16,
3416         tcg_gen_extu_i32_i64,
3417         NULL,
3418     };
3419     NeonGenWidenFn *widenfn = widenfns[a->size];
3420
3421     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3422         return false;
3423     }
3424
3425     /* UNDEF accesses to D16-D31 if they don't exist. */
3426     if (!dc_isar_feature(aa32_simd_r32, s) &&
3427         ((a->vd | a->vm) & 0x10)) {
3428         return false;
3429     }
3430
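    /* The widening destination is a Q register, so Vd must be even. */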
3431     if (a->vd & 1) {
3432         return false;
3433     }
3434
3435     if (!widenfn) {
3436         return false;
3437     }
3438
3439     if (!vfp_access_check(s)) {
3440         return true;
3441     }
3442
3443     rd = tcg_temp_new_i64();
3444
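    /* Read all of Vm before writing Vd, in case the two overlap. */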
3445     rm0 = neon_load_reg(a->vm, 0);
3446     rm1 = neon_load_reg(a->vm, 1);
3447
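    /*
     * This encoding always shifts by the input element width, and the
     * widenfns zero-extend, so shifting the whole 64-bit value never
     * spills bits from one widened element into the next.
     */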
3448     widenfn(rd, rm0);
3449     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3450     neon_store_reg64(rd, a->vd);
3451     widenfn(rd, rm1);
3452     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3453     neon_store_reg64(rd, a->vd + 1);
3454
3455     tcg_temp_free_i64(rd);
3456     tcg_temp_free_i32(rm0);
3457     tcg_temp_free_i32(rm1);
3458     return true;
3459 }
3460
3461 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3462 {
3463     TCGv_ptr fpst;
3464     TCGv_i32 ahp, tmp, tmp2, tmp3;
3465
3466     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3467         !dc_isar_feature(aa32_fp16_spconv, s)) {
3468         return false;
3469     }
3470
3471     /* UNDEF accesses to D16-D31 if they don't exist. */
3472     if (!dc_isar_feature(aa32_simd_r32, s) &&
3473         ((a->vd | a->vm) & 0x10)) {
3474         return false;
3475     }
3476
3477     if ((a->vm & 1) || (a->size != 1)) {
3478         return false;
3479     }
3480
3481     if (!vfp_access_check(s)) {
3482         return true;
3483     }
3484
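    /*
     * Neon uses the "standard FPSCR" value, but the AHP bit is taken
     * from the real FPSCR.  Convert the four f32 elements of Qm to
     * f16, packing each pair into a 32-bit lane of Dd; all of Vm is
     * read before Vd is written, so Vd may overlap Vm.
     */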
3485     fpst = fpstatus_ptr(FPST_STD);
3486     ahp = get_ahp_flag();
3487     tmp = neon_load_reg(a->vm, 0);
3488     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3489     tmp2 = neon_load_reg(a->vm, 1);
3490     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3491     tcg_gen_shli_i32(tmp2, tmp2, 16);
3492     tcg_gen_or_i32(tmp2, tmp2, tmp);
3493     tcg_temp_free_i32(tmp);
3494     tmp = neon_load_reg(a->vm, 2);
3495     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3496     tmp3 = neon_load_reg(a->vm, 3);
3497     neon_store_reg(a->vd, 0, tmp2);
3498     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3499     tcg_gen_shli_i32(tmp3, tmp3, 16);
3500     tcg_gen_or_i32(tmp3, tmp3, tmp);
3501     neon_store_reg(a->vd, 1, tmp3);
3502     tcg_temp_free_i32(tmp);
3503     tcg_temp_free_i32(ahp);
3504     tcg_temp_free_ptr(fpst);
3505
3506     return true;
3507 }
3508
3509 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3510 {
3511     TCGv_ptr fpst;
3512     TCGv_i32 ahp, tmp, tmp2, tmp3;
3513
3514     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3515         !dc_isar_feature(aa32_fp16_spconv, s)) {
3516         return false;
3517     }
3518
3519     /* UNDEF accesses to D16-D31 if they don't exist. */
3520     if (!dc_isar_feature(aa32_simd_r32, s) &&
3521         ((a->vd | a->vm) & 0x10)) {
3522         return false;
3523     }
3524
3525     if ((a->vd & 1) || (a->size != 1)) {
3526         return false;
3527     }
3528
3529     if (!vfp_access_check(s)) {
3530         return true;
3531     }
3532
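    /*
     * Each 32-bit lane of Dm holds two f16 values, which are widened
     * to four f32 elements in Qd.  Both lanes of Dm are read before
     * Qd is written, so Vd may overlap Vm.
     */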
3533     fpst = fpstatus_ptr(FPST_STD);
3534     ahp = get_ahp_flag();
3535     tmp3 = tcg_temp_new_i32();
3536     tmp = neon_load_reg(a->vm, 0);
3537     tmp2 = neon_load_reg(a->vm, 1);
3538     tcg_gen_ext16u_i32(tmp3, tmp);
3539     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3540     neon_store_reg(a->vd, 0, tmp3);
3541     tcg_gen_shri_i32(tmp, tmp, 16);
3542     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3543     neon_store_reg(a->vd, 1, tmp);
3544     tmp3 = tcg_temp_new_i32();
3545     tcg_gen_ext16u_i32(tmp3, tmp2);
3546     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3547     neon_store_reg(a->vd, 2, tmp3);
3548     tcg_gen_shri_i32(tmp2, tmp2, 16);
3549     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3550     neon_store_reg(a->vd, 3, tmp2);
3551     tcg_temp_free_i32(ahp);
3552     tcg_temp_free_ptr(fpst);
3553
3554     return true;
3555 }
3556
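/*
 * Expand a 2-reg-misc operation with a gvec expansion function that
 * operates on the whole D or Q vector at once.
 */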
3557 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3558 {
3559     int vec_size = a->q ? 16 : 8;
3560     int rd_ofs = neon_reg_offset(a->vd, 0);
3561     int rm_ofs = neon_reg_offset(a->vm, 0);
3562
3563     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3564         return false;
3565     }
3566