/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This file is intended to be included from translate.c; it uses
 * some macros and definitions provided by that file.
 * It might be possible to convert it to a standalone .c file eventually.
 */

static inline int plus1(DisasContext *s, int x)
{
    return x + 1;
}

static inline int rsub_64(DisasContext *s, int x)
{
    return 64 - x;
}

static inline int rsub_32(DisasContext *s, int x)
{
    return 32 - x;
}

static inline int rsub_16(DisasContext *s, int x)
{
    return 16 - x;
}

static inline int rsub_8(DisasContext *s, int x)
{
    return 8 - x;
}
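
/*
 * The helpers above are not called directly from this file: the
 * generated decoders reference them by name as decodetree !function
 * argument transforms (for example, turning an encoded shift field
 * into the real right-shift count 64 - imm via rsub_64). The exact
 * decode lines live in the .decode sources; the pairing described
 * here is an illustration, not a copy of them.
 */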

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
 * where 0 is the least significant end of the register.
 */
static inline long
neon_element_offset(int reg, int element, MemOp size)
{
    int element_size = 1 << size;
    int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
    /* Calculate the offset assuming fully little-endian,
     * then XOR to account for the order of the 8-byte units.
     */
    if (element_size < 8) {
        ofs ^= 8 - element_size;
    }
#endif
    return neon_reg_offset(reg, 0) + ofs;
}
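
/*
 * Worked example (illustrative): element 2 of size MO_16 gives
 * element_size = 2 and ofs = 4 on a little-endian host; on a
 * big-endian host the 16-bit units within each 8-byte unit are
 * stored in the opposite order, so ofs ^= (8 - 2), i.e. 4 ^ 6 = 2.
 */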

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = get_fpstatus_ptr(1);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
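
/*
 * Note: a->rot is passed through unchanged as the gvec "data"
 * argument, where the VCMLA helpers interpret the two-bit value as
 * the rotation (0, 90, 180 or 270 degrees); VCADD below does the
 * same with its single-bit rotation field.
 */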

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = get_fpstatus_ptr(1);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
{
    int opr_sz;
    gen_helper_gvec_3 *fn_gvec;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       opr_sz, opr_sz, 0, fn_gvec);
    return true;
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    gen_helper_gvec_3_ptr *fn_gvec_ptr;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
                   : gen_helper_gvec_fcmlah_idx);
    opr_sz = (1 + a->q) * 8;
    fpst = get_fpstatus_ptr(1);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz,
                       (a->index << 2) | a->rot, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
{
    gen_helper_gvec_3 *fn_gvec;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
    opr_sz = (1 + a->q) * 8;
    fpst = get_fpstatus_ptr(1);
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->rm),
                       opr_sz, opr_sz, a->index, fn_gvec);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
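
/*
 * Reading the table (illustrative, not exhaustive): it is indexed by
 * the instruction's 4-bit "type" field, so e.g. itype 7 ({1, 1, 1})
 * is a one-register VLD1/VST1, itype 0 ({1, 4, 1}) is VLD4/VST4 with
 * four interleaved registers, and itype 9 ({1, 2, 2}) is VLD2/VST2
 * with a register spacing of 2.
 */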

static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}
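
/*
 * The rm handling above follows the usual Neon load/store writeback
 * convention: rm == 15 means no writeback at all, rm == 13 means
 * post-index by the immediate stride computed by the caller, and any
 * other rm means post-index by that register.
 */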

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp endian = s->be_data;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    if (size == 0) {
        endian = MO_LE;
    }
    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        size = 3;
    }
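    /*
     * For example (illustrative): a little-endian VLD1.16 of one D
     * register has interleave == 1, so the four 16-bit element
     * accesses collapse into a single 64-bit access here.
     */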
    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
                }
                tcg_gen_add_i32(addr, addr, tmp);
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = 2;
    }
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if ((a->size == 2) && ((a->align & 3) == 3)) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    /*
     * TODO: if we implemented alignment exceptions, we should check
     * addr against the alignment encoded in a->align here.
     */
    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
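
/*
 * Operand-order summary (selector listed first): VBSL selects with
 * the destination, Vd = (Vn & Vd) | (Vm & ~Vd); VBIT and VBIF select
 * with Vm, giving Vd = (Vn & Vm) | (Vd & ~Vm) and
 * Vd = (Vd & Vm) | (Vn & ~Vm) respectively.
 */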

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2);
    tcg_temp_free_i32(tmp2);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}
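
/*
 * Illustrative example: for VPADD.I32 d0, d1, d2 the sequence above
 * computes d0[0] = d1[0] + d1[1] and d0[1] = d2[0] + d2[1].
 */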

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
                        bool reads_vd)
{
    /*
     * FP operations handled elementwise 32 bits at a time.
     * If reads_vd is true then the old value of Vd will be
     * loaded before calling the callback function. This is
     * used for multiply-accumulate type operations.
     */
    TCGv_i32 tmp, tmp2;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    TCGv_ptr fpstatus = get_fpstatus_ptr(1);
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        tmp = neon_load_reg(a->vn, pass);
        tmp2 = neon_load_reg(a->vm, pass);
        if (reads_vd) {
            TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
            fn(tmp_rd, tmp, tmp2, fpstatus);
            neon_store_reg(a->vd, pass, tmp_rd);
            tcg_temp_free_i32(tmp);
        } else {
            fn(tmp, tmp, tmp2, fpstatus);
            neon_store_reg(a->vd, pass, tmp);
        }
        tcg_temp_free_i32(tmp2);
    }
    tcg_temp_free_ptr(fpstatus);
    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP_GVEC(INSN,FUNC)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        TCGv_ptr fpst = get_fpstatus_ptr(1);                            \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }                                                                   \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size != 0) {                                             \
            /* TODO fp16 support */                                     \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s)

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP(INSN,FUNC,READS_VD)                                \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            /* TODO fp16 support */                                 \
            return false;                                           \
        }                                                           \
        return do_3same_fp(s, a, FUNC, READS_VD);                   \
    }

DO_3S_FP(VCEQ, gen_helper_neon_ceq_f32, false)
DO_3S_FP(VCGE, gen_helper_neon_cge_f32, false)
DO_3S_FP(VCGT, gen_helper_neon_cgt_f32, false)
DO_3S_FP(VACGE, gen_helper_neon_acge_f32, false)
DO_3S_FP(VACGT, gen_helper_neon_acgt_f32, false)
DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
DO_3S_FP(VMIN, gen_helper_vfp_mins, false)

static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
    gen_helper_vfp_adds(vd, vd, vn, fpstatus);
}

static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
    gen_helper_vfp_subs(vd, vd, vn, fpstatus);
}

DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_helper_vfp_maxnums, false);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_helper_vfp_minnums, false);
}

WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)

static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
                             uint32_t rn_ofs, uint32_t rm_ofs,
                             uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRECPS_fp_3s);
}

WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)

static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
                              uint32_t rn_ofs, uint32_t rm_ofs,
                              uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRSQRTS_fp_3s);
}

static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
}

static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!dc_isar_feature(aa32_simdfmac, s)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
}

static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                           TCGv_ptr fpstatus)
{
    gen_helper_vfp_negs(vn, vn);
    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
}

static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!dc_isar_feature(aa32_simdfmac, s)) {
        return false;
    }

    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
{
    /* FP operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    fpstatus = get_fpstatus_ptr(1);
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_ptr(fpstatus);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            /* TODO fp16 support */                                 \
            return false;                                           \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC);                        \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}
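
/*
 * E.g. (illustrative) VSHR.S8 with the maximum encoded shift of 8 is
 * clamped to an arithmetic shift by 7, which still produces the
 * architecturally required all-sign-bits result in each byte.
 */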

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        neon_load_reg64(tmp, a->vm + pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg64(tmp, a->vd + pass);
        tcg_temp_free_i64(tmp);
    }
    tcg_temp_free_i64(constimm);
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i32(dup_const(a->size, a->shift));

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg(a->vd, pass, tmp);
    }
    tcg_temp_free_i32(constimm);
    return true;
}
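
/*
 * dup_const() replicates the shift count into every lane of the
 * constant, e.g. (illustrative) a->size == 1 with a->shift == 3
 * yields 0x00030003 once truncated to the 32 bits used here.
 */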

#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }

DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)

static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
    constimm = tcg_const_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    neon_load_reg64(rm1, a->vm);
    neon_load_reg64(rm2, a->vm + 1);

    shiftfn(rm1, rm1, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm1);
    neon_store_reg(a->vd, 0, rd);

    shiftfn(rm2, rm2, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm2);
    neon_store_reg(a->vd, 1, rd);

    tcg_temp_free_i64(rm1);
    tcg_temp_free_i64(rm2);
    tcg_temp_free_i64(constimm);

    return true;
}

static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
    constimm = tcg_const_i32(imm);
1597
1598     /* Load all inputs first to avoid potential overwrite */
1599     rm1 = neon_load_reg(a->vm, 0);
1600     rm2 = neon_load_reg(a->vm, 1);
1601     rm3 = neon_load_reg(a->vm + 1, 0);
1602     rm4 = neon_load_reg(a->vm + 1, 1);
1603     rtmp = tcg_temp_new_i64();
1604
1605     shiftfn(rm1, rm1, constimm);
1606     shiftfn(rm2, rm2, constimm);
1607
1608     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1609     tcg_temp_free_i32(rm2);
1610
1611     narrowfn(rm1, cpu_env, rtmp);
1612     neon_store_reg(a->vd, 0, rm1);
1613
1614     shiftfn(rm3, rm3, constimm);
1615     shiftfn(rm4, rm4, constimm);
1616     tcg_temp_free_i32(constimm);
1617
1618     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1619     tcg_temp_free_i32(rm4);
1620
1621     narrowfn(rm3, cpu_env, rtmp);
1622     tcg_temp_free_i64(rtmp);
1623     neon_store_reg(a->vd, 1, rm3);
1624     return true;
1625 }
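
/*
 * Illustrative sketch (a hypothetical helper mirroring the immediate
 * construction above): a narrowing right shift by 5 on 16-bit lanes is
 * done with the left-shift helper and a per-lane count of -5, i.e.
 * (uint16_t)-5 = 0xfffb duplicated to 0xfffbfffb.
 */
static inline uint32_t example_negated_shift_imm(int size, int shift)
{
    uint32_t imm;

    if (size == 1) {
        imm = (uint16_t)(-shift);   /* e.g. shift 5 -> 0xfffb */
        imm |= imm << 16;           /* -> 0xfffbfffb, one count per lane */
    } else {
        imm = -shift;               /* size == 2: a single 32-bit lane */
    }
    return imm;
}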
1626
1627 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1628     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1629     {                                                                   \
1630         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1631     }
1632 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1633     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1634     {                                                                   \
1635         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1636     }
1637
1638 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1639 {
1640     tcg_gen_extrl_i64_i32(dest, src);
1641 }
1642
1643 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1644 {
1645     gen_helper_neon_narrow_u16(dest, src);
1646 }
1647
1648 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1649 {
1650     gen_helper_neon_narrow_u8(dest, src);
1651 }
1652
1653 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1654 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1655 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1656
1657 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1658 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1659 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1660
1661 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1662 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1663 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1664
1665 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1666 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1667 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1668 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1669 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1670 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1671
1672 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1673 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1674 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1675
1676 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1677 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1678 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1679
1680 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1681 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1682 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1683
1684 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1685                          NeonGenWidenFn *widenfn, bool u)
1686 {
1687     TCGv_i64 tmp;
1688     TCGv_i32 rm0, rm1;
1689     uint64_t widen_mask = 0;
1690
1691     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1692         return false;
1693     }
1694
1695     /* UNDEF accesses to D16-D31 if they don't exist. */
1696     if (!dc_isar_feature(aa32_simd_r32, s) &&
1697         ((a->vd | a->vm) & 0x10)) {
1698         return false;
1699     }
1700
1701     if (a->vd & 1) {
1702         return false;
1703     }
1704
1705     if (!vfp_access_check(s)) {
1706         return true;
1707     }
1708
1709     /*
1710      * This is a widen-and-shift operation. The shift is always less
1711      * than the width of the source type, so after widening the input
1712      * vector we can simply shift the whole 64-bit widened register,
1713      * and then clear the bits that spill from the top of one narrow
1714      * input into the low end of its more significant neighbour's
1715      * widened lane. Calculate a mask of the bits to clear.
1716      */
1717     if ((a->shift != 0) && (a->size < 2 || u)) {
1718         int esize = 8 << a->size;
1719         widen_mask = MAKE_64BIT_MASK(0, esize);
1720         widen_mask >>= esize - a->shift;
1721         widen_mask = dup_const(a->size + 1, widen_mask);
1722     }
1723
1724     rm0 = neon_load_reg(a->vm, 0);
1725     rm1 = neon_load_reg(a->vm, 1);
1726     tmp = tcg_temp_new_i64();
1727
1728     widenfn(tmp, rm0);
1729     tcg_temp_free_i32(rm0);
1730     if (a->shift != 0) {
1731         tcg_gen_shli_i64(tmp, tmp, a->shift);
1732         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1733     }
1734     neon_store_reg64(tmp, a->vd);
1735
1736     widenfn(tmp, rm1);
1737     tcg_temp_free_i32(rm1);
1738     if (a->shift != 0) {
1739         tcg_gen_shli_i64(tmp, tmp, a->shift);
1740         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1741     }
1742     neon_store_reg64(tmp, a->vd + 1);
1743     tcg_temp_free_i64(tmp);
1744     return true;
1745 }
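
/*
 * Worked example for widen_mask above (an illustrative sketch reusing
 * the hypothetical example_dup_const() from earlier, assuming
 * 0 < shift < esize): for size == 0 and shift == 3, esize is 8, so the
 * mask is (0xff >> 5) = 0x07 duplicated over 16-bit lanes, giving
 * 0x0007000700070007; clearing those bits removes anything a lane's
 * sign extension pushed into the lane above it.
 */
static inline uint64_t example_vshll_widen_mask(int size, int shift)
{
    int esize = 8 << size;                      /* narrow element width */
    uint64_t mask = ((1ull << esize) - 1) >> (esize - shift);

    return example_dup_const(size + 1, mask);   /* one mask per wide lane */
}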
1746
1747 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1748 {
1749     static NeonGenWidenFn * const widenfn[] = {
1750         gen_helper_neon_widen_s8,
1751         gen_helper_neon_widen_s16,
1752         tcg_gen_ext_i32_i64,
1753     };
1754     return do_vshll_2sh(s, a, widenfn[a->size], false);
1755 }
1756
1757 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1758 {
1759     static NeonGenWidenFn * const widenfn[] = {
1760         gen_helper_neon_widen_u8,
1761         gen_helper_neon_widen_u16,
1762         tcg_gen_extu_i32_i64,
1763     };
1764     return do_vshll_2sh(s, a, widenfn[a->size], true);
1765 }
1766
1767 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1768                       NeonGenTwoSingleOpFn *fn)
1769 {
1770     /* FP operations in 2-reg-and-shift group */
1771     TCGv_i32 tmp, shiftv;
1772     TCGv_ptr fpstatus;
1773     int pass;
1774
1775     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1776         return false;
1777     }
1778
1779     /* UNDEF accesses to D16-D31 if they don't exist. */
1780     if (!dc_isar_feature(aa32_simd_r32, s) &&
1781         ((a->vd | a->vm) & 0x10)) {
1782         return false;
1783     }
1784
1785     if ((a->vm | a->vd) & a->q) {
1786         return false;
1787     }
1788
1789     if (!vfp_access_check(s)) {
1790         return true;
1791     }
1792
1793     fpstatus = get_fpstatus_ptr(1);
1794     shiftv = tcg_const_i32(a->shift);
1795     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1796         tmp = neon_load_reg(a->vm, pass);
1797         fn(tmp, tmp, shiftv, fpstatus);
1798         neon_store_reg(a->vd, pass, tmp);
1799     }
1800     tcg_temp_free_ptr(fpstatus);
1801     tcg_temp_free_i32(shiftv);
1802     return true;
1803 }
1804
1805 #define DO_FP_2SH(INSN, FUNC)                                           \
1806     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1807     {                                                                   \
1808         return do_fp_2sh(s, a, FUNC);                                   \
1809     }
1810
1811 DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
1812 DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
1813 DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
1814 DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
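
/*
 * Note (illustrative, derived from the helper semantics): a->shift here
 * is the number of fraction bits in the fixed-point format, so e.g.
 * VCVT.F32.S32 with #4 maps the integer 40 to 40 / 2^4 = 2.5f, while
 * the float-to-fixed directions round toward zero, as the helper names
 * indicate.
 */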
1815
1816 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1817 {
1818     /*
1819      * Expand the encoded constant.
1820      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 with imm = 0 is UNPREDICTABLE.
1821      * We choose not to special-case this and will behave as if a
1822      * valid constant encoding of 0 had been given.
1823      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1824      */
1825     switch (cmode) {
1826     case 0: case 1:
1827         /* no-op */
1828         break;
1829     case 2: case 3:
1830         imm <<= 8;
1831         break;
1832     case 4: case 5:
1833         imm <<= 16;
1834         break;
1835     case 6: case 7:
1836         imm <<= 24;
1837         break;
1838     case 8: case 9:
1839         imm |= imm << 16;
1840         break;
1841     case 10: case 11:
1842         imm = (imm << 8) | (imm << 24);
1843         break;
1844     case 12:
1845         imm = (imm << 8) | 0xff;
1846         break;
1847     case 13:
1848         imm = (imm << 16) | 0xffff;
1849         break;
1850     case 14:
1851         if (op) {
1852             /*
1853              * This is the only case where the top and bottom 32 bits
1854              * of the encoded constant differ.
1855              */
1856             uint64_t imm64 = 0;
1857             int n;
1858
1859             for (n = 0; n < 8; n++) {
1860                 if (imm & (1 << n)) {
1861                     imm64 |= (0xffULL << (n * 8));
1862                 }
1863             }
1864             return imm64;
1865         }
1866         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1867         break;
1868     case 15:
1869         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1870             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1871         break;
1872     }
1873     if (op) {
1874         imm = ~imm;
1875     }
1876     return dup_const(MO_32, imm);
1877 }
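
/*
 * Worked examples for the expansion above (derived by hand from the
 * switch cases; illustrative only):
 *   cmode = 12, op = 0, imm = 0x12 -> 0x000012ff in each 32-bit lane
 *   cmode = 14, op = 1, imm = 0xa5 -> 0xff00ff0000ff00ff (bit n of imm
 *     selects byte n of the 64-bit result)
 *   cmode = 15, op = 0, imm = 0x70 -> 0x3f800000 per lane, i.e. 1.0f
 */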
1878
1879 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1880                         GVecGen2iFn *fn)
1881 {
1882     uint64_t imm;
1883     int reg_ofs, vec_size;
1884
1885     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1886         return false;
1887     }
1888
1889     /* UNDEF accesses to D16-D31 if they don't exist. */
1890     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1891         return false;
1892     }
1893
1894     if (a->vd & a->q) {
1895         return false;
1896     }
1897
1898     if (!vfp_access_check(s)) {
1899         return true;
1900     }
1901
1902     reg_ofs = neon_reg_offset(a->vd, 0);
1903     vec_size = a->q ? 16 : 8;
1904     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1905
1906     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1907     return true;
1908 }
1909
1910 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1911                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1912 {
1913     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1914 }
1915
1916 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1917 {
1918     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1919     GVecGen2iFn *fn;
1920
1921     if ((a->cmode & 1) && a->cmode < 12) {
1922         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1923         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1924     } else {
1925         /* There is one unallocated cmode/op combination in this space */
1926         if (a->cmode == 15 && a->op == 1) {
1927             return false;
1928         }
1929         fn = gen_VMOV_1r;
1930     }
1931     return do_1reg_imm(s, a, fn);
1932 }
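
/*
 * Worked example (illustrative): for an odd cmode below 12 with op = 1,
 * asimd_imm_const() has already inverted the constant, so VBIC with an
 * expanded immediate of 0x000000ff per lane becomes a gvec AND with
 * 0xffffff00 per lane; no separate BIC expander is needed.
 */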
1933
1934 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1935                            NeonGenWidenFn *widenfn,
1936                            NeonGenTwo64OpFn *opfn,
1937                            bool src1_wide)
1938 {
1939     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1940     TCGv_i64 rn0_64, rn1_64, rm_64;
1941     TCGv_i32 rm;
1942
1943     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1944         return false;
1945     }
1946
1947     /* UNDEF accesses to D16-D31 if they don't exist. */
1948     if (!dc_isar_feature(aa32_simd_r32, s) &&
1949         ((a->vd | a->vn | a->vm) & 0x10)) {
1950         return false;
1951     }
1952
1953     if (!widenfn || !opfn) {
1954         /* size == 3 case, which is an entirely different insn group */
1955         return false;
1956     }
1957
1958     if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
1959         return false;
1960     }
1961
1962     if (!vfp_access_check(s)) {
1963         return true;
1964     }
1965
1966     rn0_64 = tcg_temp_new_i64();
1967     rn1_64 = tcg_temp_new_i64();
1968     rm_64 = tcg_temp_new_i64();
1969
1970     if (src1_wide) {
1971         neon_load_reg64(rn0_64, a->vn);
1972     } else {
1973         TCGv_i32 tmp = neon_load_reg(a->vn, 0);
1974         widenfn(rn0_64, tmp);
1975         tcg_temp_free_i32(tmp);
1976     }
1977     rm = neon_load_reg(a->vm, 0);
1978
1979     widenfn(rm_64, rm);
1980     tcg_temp_free_i32(rm);
1981     opfn(rn0_64, rn0_64, rm_64);
1982
1983     /*
1984      * Load second pass inputs before storing the first pass result, to
1985      * avoid incorrect results if a narrow input overlaps with the result.
1986      */
1987     if (src1_wide) {
1988         neon_load_reg64(rn1_64, a->vn + 1);
1989     } else {
1990         TCGv_i32 tmp = neon_load_reg(a->vn, 1);
1991         widenfn(rn1_64, tmp);
1992         tcg_temp_free_i32(tmp);
1993     }
1994     rm = neon_load_reg(a->vm, 1);
1995
1996     neon_store_reg64(rn0_64, a->vd);
1997
1998     widenfn(rm_64, rm);
1999     tcg_temp_free_i32(rm);
2000     opfn(rn1_64, rn1_64, rm_64);
2001     neon_store_reg64(rn1_64, a->vd + 1);
2002
2003     tcg_temp_free_i64(rn0_64);
2004     tcg_temp_free_i64(rn1_64);
2005     tcg_temp_free_i64(rm_64);
2006
2007     return true;
2008 }
2009
2010 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
2011     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2012     {                                                                   \
2013         static NeonGenWidenFn * const widenfn[] = {                     \
2014             gen_helper_neon_widen_##S##8,                               \
2015             gen_helper_neon_widen_##S##16,                              \
2016             tcg_gen_##EXT##_i32_i64,                                    \
2017             NULL,                                                       \
2018         };                                                              \
2019         static NeonGenTwo64OpFn * const addfn[] = {                     \
2020             gen_helper_neon_##OP##l_u16,                                \
2021             gen_helper_neon_##OP##l_u32,                                \
2022             tcg_gen_##OP##_i64,                                         \
2023             NULL,                                                       \
2024         };                                                              \
2025         return do_prewiden_3d(s, a, widenfn[a->size],                   \
2026                               addfn[a->size], SRC1WIDE);                \
2027     }
2028
2029 DO_PREWIDEN(VADDL_S, s, ext, add, false)
2030 DO_PREWIDEN(VADDL_U, u, extu, add, false)
2031 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
2032 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
2033 DO_PREWIDEN(VADDW_S, s, ext, add, true)
2034 DO_PREWIDEN(VADDW_U, u, extu, add, true)
2035 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
2036 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
2037
2038 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
2039                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
2040 {
2041     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
2042     TCGv_i64 rn_64, rm_64;
2043     TCGv_i32 rd0, rd1;
2044
2045     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2046         return false;
2047     }
2048
2049     /* UNDEF accesses to D16-D31 if they don't exist. */
2050     if (!dc_isar_feature(aa32_simd_r32, s) &&
2051         ((a->vd | a->vn | a->vm) & 0x10)) {
2052         return false;
2053     }
2054
2055     if (!opfn || !narrowfn) {
2056         /* size == 3 case, which is an entirely different insn group */
2057         return false;
2058     }
2059
2060     if ((a->vn | a->vm) & 1) {
2061         return false;
2062     }
2063
2064     if (!vfp_access_check(s)) {
2065         return true;
2066     }
2067
2068     rn_64 = tcg_temp_new_i64();
2069     rm_64 = tcg_temp_new_i64();
2070     rd0 = tcg_temp_new_i32();
2071     rd1 = tcg_temp_new_i32();
2072
2073     neon_load_reg64(rn_64, a->vn);
2074     neon_load_reg64(rm_64, a->vm);
2075
2076     opfn(rn_64, rn_64, rm_64);
2077
2078     narrowfn(rd0, rn_64);
2079
2080     neon_load_reg64(rn_64, a->vn + 1);
2081     neon_load_reg64(rm_64, a->vm + 1);
2082
2083     opfn(rn_64, rn_64, rm_64);
2084
2085     narrowfn(rd1, rn_64);
2086
2087     neon_store_reg(a->vd, 0, rd0);
2088     neon_store_reg(a->vd, 1, rd1);
2089
2090     tcg_temp_free_i64(rn_64);
2091     tcg_temp_free_i64(rm_64);
2092
2093     return true;
2094 }
2095
2096 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
2097     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2098     {                                                                   \
2099         static NeonGenTwo64OpFn * const addfn[] = {                     \
2100             gen_helper_neon_##OP##l_u16,                                \
2101             gen_helper_neon_##OP##l_u32,                                \
2102             tcg_gen_##OP##_i64,                                         \
2103             NULL,                                                       \
2104         };                                                              \
2105         static NeonGenNarrowFn * const narrowfn[] = {                   \
2106             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
2107             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
2108             EXTOP,                                                      \
2109             NULL,                                                       \
2110         };                                                              \
2111         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2112     }
2113
2114 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2115 {
2116     tcg_gen_addi_i64(rn, rn, 1u << 31);
2117     tcg_gen_extrh_i64_i32(rd, rn);
2118 }
2119
2120 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2121 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2122 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2123 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
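
/*
 * Illustrative sketch (a hypothetical host-side mirror of
 * gen_narrow_round_high_u32() above): adding 1 << 31 before taking the
 * high half rounds to nearest, e.g. an intermediate of
 * 0x0000000080000000 narrows to 1 rather than 0.
 */
static inline uint32_t example_narrow_round_high_u32(uint64_t x)
{
    return (uint32_t)((x + (1ull << 31)) >> 32);
}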
2124
2125 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2126                        NeonGenTwoOpWidenFn *opfn,
2127                        NeonGenTwo64OpFn *accfn)
2128 {
2129     /*
2130      * 3-regs different lengths, long operations.
2131      * These perform an operation on two inputs that returns a double-width
2132      * result, and then possibly perform an accumulation operation of
2133      * that result into the double-width destination.
2134      */
2135     TCGv_i64 rd0, rd1, tmp;
2136     TCGv_i32 rn, rm;
2137
2138     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2139         return false;
2140     }
2141
2142     /* UNDEF accesses to D16-D31 if they don't exist. */
2143     if (!dc_isar_feature(aa32_simd_r32, s) &&
2144         ((a->vd | a->vn | a->vm) & 0x10)) {
2145         return false;
2146     }
2147
2148     if (!opfn) {
2149         /* size == 3 case, which is an entirely different insn group */
2150         return false;
2151     }
2152
2153     if (a->vd & 1) {
2154         return false;
2155     }
2156
2157     if (!vfp_access_check(s)) {
2158         return true;
2159     }
2160
2161     rd0 = tcg_temp_new_i64();
2162     rd1 = tcg_temp_new_i64();
2163
2164     rn = neon_load_reg(a->vn, 0);
2165     rm = neon_load_reg(a->vm, 0);
2166     opfn(rd0, rn, rm);
2167     tcg_temp_free_i32(rn);
2168     tcg_temp_free_i32(rm);
2169
2170     rn = neon_load_reg(a->vn, 1);
2171     rm = neon_load_reg(a->vm, 1);
2172     opfn(rd1, rn, rm);
2173     tcg_temp_free_i32(rn);
2174     tcg_temp_free_i32(rm);
2175
2176     /* Don't store results until after all loads: they might overlap */
2177     if (accfn) {
2178         tmp = tcg_temp_new_i64();
2179         neon_load_reg64(tmp, a->vd);
2180         accfn(tmp, tmp, rd0);
2181         neon_store_reg64(tmp, a->vd);
2182         neon_load_reg64(tmp, a->vd + 1);
2183         accfn(tmp, tmp, rd1);
2184         neon_store_reg64(tmp, a->vd + 1);
2185         tcg_temp_free_i64(tmp);
2186     } else {
2187         neon_store_reg64(rd0, a->vd);
2188         neon_store_reg64(rd1, a->vd + 1);
2189     }
2190
2191     tcg_temp_free_i64(rd0);
2192     tcg_temp_free_i64(rd1);
2193
2194     return true;
2195 }
2196
2197 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2198 {
2199     static NeonGenTwoOpWidenFn * const opfn[] = {
2200         gen_helper_neon_abdl_s16,
2201         gen_helper_neon_abdl_s32,
2202         gen_helper_neon_abdl_s64,
2203         NULL,
2204     };
2205
2206     return do_long_3d(s, a, opfn[a->size], NULL);
2207 }
2208
2209 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2210 {
2211     static NeonGenTwoOpWidenFn * const opfn[] = {
2212         gen_helper_neon_abdl_u16,
2213         gen_helper_neon_abdl_u32,
2214         gen_helper_neon_abdl_u64,
2215         NULL,
2216     };
2217
2218     return do_long_3d(s, a, opfn[a->size], NULL);
2219 }
2220
2221 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2222 {
2223     static NeonGenTwoOpWidenFn * const opfn[] = {
2224         gen_helper_neon_abdl_s16,
2225         gen_helper_neon_abdl_s32,
2226         gen_helper_neon_abdl_s64,
2227         NULL,
2228     };
2229     static NeonGenTwo64OpFn * const addfn[] = {
2230         gen_helper_neon_addl_u16,
2231         gen_helper_neon_addl_u32,
2232         tcg_gen_add_i64,
2233         NULL,
2234     };
2235
2236     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2237 }
2238
2239 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2240 {
2241     static NeonGenTwoOpWidenFn * const opfn[] = {
2242         gen_helper_neon_abdl_u16,
2243         gen_helper_neon_abdl_u32,
2244         gen_helper_neon_abdl_u64,
2245         NULL,
2246     };
2247     static NeonGenTwo64OpFn * const addfn[] = {
2248         gen_helper_neon_addl_u16,
2249         gen_helper_neon_addl_u32,
2250         tcg_gen_add_i64,
2251         NULL,
2252     };
2253
2254     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2255 }
2256
2257 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2258 {
2259     TCGv_i32 lo = tcg_temp_new_i32();
2260     TCGv_i32 hi = tcg_temp_new_i32();
2261
2262     tcg_gen_muls2_i32(lo, hi, rn, rm);
2263     tcg_gen_concat_i32_i64(rd, lo, hi);
2264
2265     tcg_temp_free_i32(lo);
2266     tcg_temp_free_i32(hi);
2267 }
2268
2269 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2270 {
2271     TCGv_i32 lo = tcg_temp_new_i32();
2272     TCGv_i32 hi = tcg_temp_new_i32();
2273
2274     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2275     tcg_gen_concat_i32_i64(rd, lo, hi);
2276
2277     tcg_temp_free_i32(lo);
2278     tcg_temp_free_i32(hi);
2279 }
2280
2281 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2282 {
2283     static NeonGenTwoOpWidenFn * const opfn[] = {
2284         gen_helper_neon_mull_s8,
2285         gen_helper_neon_mull_s16,
2286         gen_mull_s32,
2287         NULL,
2288     };
2289
2290     return do_long_3d(s, a, opfn[a->size], NULL);
2291 }
2292
2293 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2294 {
2295     static NeonGenTwoOpWidenFn * const opfn[] = {
2296         gen_helper_neon_mull_u8,
2297         gen_helper_neon_mull_u16,
2298         gen_mull_u32,
2299         NULL,
2300     };
2301
2302     return do_long_3d(s, a, opfn[a->size], NULL);
2303 }
2304
2305 #define DO_VMLAL(INSN, MULL, ACC)                                       \
2306     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2307     {                                                                   \
2308         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2309             gen_helper_neon_##MULL##8,                                  \
2310             gen_helper_neon_##MULL##16,                                 \
2311             gen_##MULL##32,                                             \
2312             NULL,                                                       \
2313         };                                                              \
2314         static NeonGenTwo64OpFn * const accfn[] = {                     \
2315             gen_helper_neon_##ACC##l_u16,                               \
2316             gen_helper_neon_##ACC##l_u32,                               \
2317             tcg_gen_##ACC##_i64,                                        \
2318             NULL,                                                       \
2319         };                                                              \
2320         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2321     }
2322
2323 DO_VMLAL(VMLAL_S, mull_s, add)
2324 DO_VMLAL(VMLAL_U, mull_u, add)
2325 DO_VMLAL(VMLSL_S, mull_s, sub)
2326 DO_VMLAL(VMLSL_U, mull_u, sub)
2327
2328 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2329 {
2330     gen_helper_neon_mull_s16(rd, rn, rm);
2331     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2332 }
2333
2334 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2335 {
2336     gen_mull_s32(rd, rn, rm);
2337     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2338 }
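
/*
 * Worked example (illustrative): VQDMULL doubles the product with a
 * saturating add of the result to itself, so rn = rm = 0x8000 (-32768)
 * gives a product of 0x40000000; doubling would be 0x80000000, which
 * saturates to 0x7fffffff and sets QC.
 */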
2339
2340 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2341 {
2342     static NeonGenTwoOpWidenFn * const opfn[] = {
2343         NULL,
2344         gen_VQDMULL_16,
2345         gen_VQDMULL_32,
2346         NULL,
2347     };
2348
2349     return do_long_3d(s, a, opfn[a->size], NULL);
2350 }
2351
2352 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2353 {
2354     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2355 }
2356
2357 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2358 {
2359     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2360 }
2361
2362 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2363 {
2364     static NeonGenTwoOpWidenFn * const opfn[] = {
2365         NULL,
2366         gen_VQDMULL_16,
2367         gen_VQDMULL_32,
2368         NULL,
2369     };
2370     static NeonGenTwo64OpFn * const accfn[] = {
2371         NULL,
2372         gen_VQDMLAL_acc_16,
2373         gen_VQDMLAL_acc_32,
2374         NULL,
2375     };
2376
2377     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2378 }
2379
2380 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2381 {
2382     gen_helper_neon_negl_u32(rm, rm);
2383     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2384 }
2385
2386 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2387 {
2388     tcg_gen_neg_i64(rm, rm);
2389     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2390 }
2391
2392 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2393 {
2394     static NeonGenTwoOpWidenFn * const opfn[] = {
2395         NULL,
2396         gen_VQDMULL_16,
2397         gen_VQDMULL_32,
2398         NULL,
2399     };
2400     static NeonGenTwo64OpFn * const accfn[] = {
2401         NULL,
2402         gen_VQDMLSL_acc_16,
2403         gen_VQDMLSL_acc_32,
2404         NULL,
2405     };
2406
2407     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2408 }
2409
2410 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2411 {
2412     gen_helper_gvec_3 *fn_gvec;
2413
2414     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2415         return false;
2416     }
2417
2418     /* UNDEF accesses to D16-D31 if they don't exist. */
2419     if (!dc_isar_feature(aa32_simd_r32, s) &&
2420         ((a->vd | a->vn | a->vm) & 0x10)) {
2421         return false;
2422     }
2423
2424     if (a->vd & 1) {
2425         return false;
2426     }
2427
2428     switch (a->size) {
2429     case 0:
2430         fn_gvec = gen_helper_neon_pmull_h;
2431         break;
2432     case 2:
2433         if (!dc_isar_feature(aa32_pmull, s)) {
2434             return false;
2435         }
2436         fn_gvec = gen_helper_gvec_pmull_q;
2437         break;
2438     default:
2439         return false;
2440     }
2441
2442     if (!vfp_access_check(s)) {
2443         return true;
2444     }
2445
2446     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2447                        neon_reg_offset(a->vn, 0),
2448                        neon_reg_offset(a->vm, 0),
2449                        16, 16, 0, fn_gvec);
2450     return true;
2451 }
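
/*
 * Illustrative sketch (a hypothetical per-element mirror of the
 * polynomial multiply helpers used above): a polynomial multiply is a
 * carry-less multiply, XORing shifted copies instead of adding them.
 */
static inline uint16_t example_pmull_8x8(uint8_t a, uint8_t b)
{
    uint16_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        if (b & (1 << i)) {
            r ^= (uint16_t)a << i;
        }
    }
    return r;
}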
2452
2453 static void gen_neon_dup_low16(TCGv_i32 var)
2454 {
2455     TCGv_i32 tmp = tcg_temp_new_i32();
2456     tcg_gen_ext16u_i32(var, var);
2457     tcg_gen_shli_i32(tmp, var, 16);
2458     tcg_gen_or_i32(var, var, tmp);
2459     tcg_temp_free_i32(tmp);
2460 }
2461
2462 static void gen_neon_dup_high16(TCGv_i32 var)
2463 {
2464     TCGv_i32 tmp = tcg_temp_new_i32();
2465     tcg_gen_andi_i32(var, var, 0xffff0000);
2466     tcg_gen_shri_i32(tmp, var, 16);
2467     tcg_gen_or_i32(var, var, tmp);
2468     tcg_temp_free_i32(tmp);
2469 }
2470
2471 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2472 {
2473     TCGv_i32 tmp;
2474     if (size == 1) {
2475         tmp = neon_load_reg(reg & 7, reg >> 4);
2476         if (reg & 8) {
2477             gen_neon_dup_high16(tmp);
2478         } else {
2479             gen_neon_dup_low16(tmp);
2480         }
2481     } else {
2482         tmp = neon_load_reg(reg & 15, reg >> 4);
2483     }
2484     return tmp;
2485 }
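
/*
 * Illustrative sketch (a hypothetical decode mirroring neon_get_scalar()
 * above): the scalar encoding packs the D register, the 32-bit word
 * within it and, for 16-bit scalars, the half to duplicate.
 */
static inline void example_scalar_index(int size, int reg,
                                        int *dreg, int *word, int *half)
{
    if (size == 1) {
        *dreg = reg & 7;        /* D register holding the scalar */
        *word = reg >> 4;       /* which 32-bit word of that register */
        *half = (reg >> 3) & 1; /* which 16-bit half gets duplicated */
    } else {
        *dreg = reg & 15;
        *word = reg >> 4;
        *half = 0;              /* the whole 32-bit word is the scalar */
    }
}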
2486
2487 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2488                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2489 {
2490     /*
2491      * Two registers and a scalar: perform an operation between
2492      * the input elements and the scalar, and then possibly
2493      * perform an accumulation operation of that result into the
2494      * destination.
2495      */
2496     TCGv_i32 scalar;
2497     int pass;
2498
2499     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2500         return false;
2501     }
2502
2503     /* UNDEF accesses to D16-D31 if they don't exist. */
2504     if (!dc_isar_feature(aa32_simd_r32, s) &&
2505         ((a->vd | a->vn | a->vm) & 0x10)) {
2506         return false;
2507     }
2508
2509     if (!opfn) {
2510         /* Bad size (including size == 3, which is a different insn group) */
2511         return false;
2512     }
2513
2514     if (a->q && ((a->vd | a->vn) & 1)) {
2515         return false;
2516     }
2517
2518     if (!vfp_access_check(s)) {
2519         return true;
2520     }
2521
2522     scalar = neon_get_scalar(a->size, a->vm);
2523
2524     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2525         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2526         opfn(tmp, tmp, scalar);
2527         if (accfn) {
2528             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2529             accfn(tmp, rd, tmp);
2530             tcg_temp_free_i32(rd);
2531         }
2532         neon_store_reg(a->vd, pass, tmp);
2533     }
2534     tcg_temp_free_i32(scalar);
2535     return true;
2536 }
2537
2538 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2539 {
2540     static NeonGenTwoOpFn * const opfn[] = {
2541         NULL,
2542         gen_helper_neon_mul_u16,
2543         tcg_gen_mul_i32,
2544         NULL,
2545     };
2546
2547     return do_2scalar(s, a, opfn[a->size], NULL);
2548 }
2549
2550 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2551 {
2552     static NeonGenTwoOpFn * const opfn[] = {
2553         NULL,
2554         gen_helper_neon_mul_u16,
2555         tcg_gen_mul_i32,
2556         NULL,
2557     };
2558     static NeonGenTwoOpFn * const accfn[] = {
2559         NULL,
2560         gen_helper_neon_add_u16,
2561         tcg_gen_add_i32,
2562         NULL,
2563     };
2564
2565     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2566 }
2567
2568 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2569 {
2570     static NeonGenTwoOpFn * const opfn[] = {
2571         NULL,
2572         gen_helper_neon_mul_u16,
2573         tcg_gen_mul_i32,
2574         NULL,
2575     };
2576     static NeonGenTwoOpFn * const accfn[] = {
2577         NULL,
2578         gen_helper_neon_sub_u16,
2579         tcg_gen_sub_i32,
2580         NULL,
2581     };
2582
2583     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2584 }
2585
2586 /*
2587  * Rather than have a float-specific version of do_2scalar just for
2588  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2589  * a NeonGenTwoOpFn.
2590  */
2591 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2592     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2593     {                                                           \
2594         TCGv_ptr fpstatus = get_fpstatus_ptr(1);                \
2595         FUNC(rd, rn, rm, fpstatus);                             \
2596         tcg_temp_free_ptr(fpstatus);                            \
2597     }
2598
2599 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2600 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2601 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
2602
2603 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2604 {
2605     static NeonGenTwoOpFn * const opfn[] = {
2606         NULL,
2607         NULL, /* TODO: fp16 support */
2608         gen_VMUL_F_mul,
2609         NULL,
2610     };
2611
2612     return do_2scalar(s, a, opfn[a->size], NULL);
2613 }
2614
2615 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2616 {
2617     static NeonGenTwoOpFn * const opfn[] = {
2618         NULL,
2619         NULL, /* TODO: fp16 support */
2620         gen_VMUL_F_mul,
2621         NULL,
2622     };
2623     static NeonGenTwoOpFn * const accfn[] = {
2624         NULL,
2625         NULL, /* TODO: fp16 support */
2626         gen_VMUL_F_add,
2627         NULL,
2628     };
2629
2630     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2631 }
2632
2633 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2634 {
2635     static NeonGenTwoOpFn * const opfn[] = {
2636         NULL,
2637         NULL, /* TODO: fp16 support */
2638         gen_VMUL_F_mul,
2639         NULL,
2640     };
2641     static NeonGenTwoOpFn * const accfn[] = {
2642         NULL,
2643         NULL, /* TODO: fp16 support */
2644         gen_VMUL_F_sub,
2645         NULL,
2646     };
2647
2648     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2649 }
2650
2651 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2652 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2653 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2654 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2655
2656 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2657 {
2658     static NeonGenTwoOpFn * const opfn[] = {
2659         NULL,
2660         gen_VQDMULH_16,
2661         gen_VQDMULH_32,
2662         NULL,
2663     };
2664
2665     return do_2scalar(s, a, opfn[a->size], NULL);
2666 }
2667
2668 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2669 {
2670     static NeonGenTwoOpFn * const opfn[] = {
2671         NULL,
2672         gen_VQRDMULH_16,
2673         gen_VQRDMULH_32,
2674         NULL,
2675     };
2676
2677     return do_2scalar(s, a, opfn[a->size], NULL);
2678 }
2679
2680 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2681                             NeonGenThreeOpEnvFn *opfn)
2682 {
2683     /*
2684      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2685      * performs a kind of fused op-then-accumulate using a helper
2686      * function that takes all of rd, rn and the scalar at once.
2687      */
2688     TCGv_i32 scalar;
2689     int pass;
2690
2691     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2692         return false;
2693     }
2694
2695     if (!dc_isar_feature(aa32_rdm, s)) {
2696         return false;
2697     }
2698
2699     /* UNDEF accesses to D16-D31 if they don't exist. */
2700     if (!dc_isar_feature(aa32_simd_r32, s) &&
2701         ((a->vd | a->vn | a->vm) & 0x10)) {
2702         return false;
2703     }
2704
2705     if (!opfn) {
2706         /* Bad size (including size == 3, which is a different insn group) */
2707         return false;
2708     }
2709
2710     if (a->q && ((a->vd | a->vn) & 1)) {
2711         return false;
2712     }
2713
2714     if (!vfp_access_check(s)) {
2715         return true;
2716     }
2717
2718     scalar = neon_get_scalar(a->size, a->vm);
2719
2720     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2721         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2722         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2723         opfn(rd, cpu_env, rn, scalar, rd);
2724         tcg_temp_free_i32(rn);
2725         neon_store_reg(a->vd, pass, rd);
2726     }
2727     tcg_temp_free_i32(scalar);
2728
2729     return true;
2730 }
2731
2732 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2733 {
2734     static NeonGenThreeOpEnvFn * const opfn[] = {
2735         NULL,
2736         gen_helper_neon_qrdmlah_s16,
2737         gen_helper_neon_qrdmlah_s32,
2738         NULL,
2739     };
2740     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2741 }
2742
2743 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2744 {
2745     static NeonGenThreeOpEnvFn * const opfn[] = {
2746         NULL,
2747         gen_helper_neon_qrdmlsh_s16,
2748         gen_helper_neon_qrdmlsh_s32,
2749         NULL,
2750     };
2751     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2752 }
2753
2754 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2755                             NeonGenTwoOpWidenFn *opfn,
2756                             NeonGenTwo64OpFn *accfn)
2757 {
2758     /*
2759      * Two registers and a scalar, long operations: perform an
2760      * operation on the input elements and the scalar which produces
2761      * a double-width result, and then possibly perform an accumulation
2762      * operation of that result into the destination.
2763      */
2764     TCGv_i32 scalar, rn;
2765     TCGv_i64 rn0_64, rn1_64;
2766
2767     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2768         return false;
2769     }
2770
2771     /* UNDEF accesses to D16-D31 if they don't exist. */
2772     if (!dc_isar_feature(aa32_simd_r32, s) &&
2773         ((a->vd | a->vn | a->vm) & 0x10)) {
2774         return false;
2775     }
2776
2777     if (!opfn) {
2778         /* Bad size (including size == 3, which is a different insn group) */
2779         return false;
2780     }
2781
2782     if (a->vd & 1) {
2783         return false;
2784     }
2785
2786     if (!vfp_access_check(s)) {
2787         return true;
2788     }
2789
2790     scalar = neon_get_scalar(a->size, a->vm);
2791
2792     /* Load all inputs before writing any outputs, in case of overlap */
2793     rn = neon_load_reg(a->vn, 0);
2794     rn0_64 = tcg_temp_new_i64();
2795     opfn(rn0_64, rn, scalar);
2796     tcg_temp_free_i32(rn);
2797
2798     rn = neon_load_reg(a->vn, 1);
2799     rn1_64 = tcg_temp_new_i64();
2800     opfn(rn1_64, rn, scalar);
2801     tcg_temp_free_i32(rn);
2802     tcg_temp_free_i32(scalar);
2803
2804     if (accfn) {
2805         TCGv_i64 t64 = tcg_temp_new_i64();
2806         neon_load_reg64(t64, a->vd);
2807         accfn(t64, t64, rn0_64);
2808         neon_store_reg64(t64, a->vd);
2809         neon_load_reg64(t64, a->vd + 1);
2810         accfn(t64, t64, rn1_64);
2811         neon_store_reg64(t64, a->vd + 1);
2812         tcg_temp_free_i64(t64);
2813     } else {
2814         neon_store_reg64(rn0_64, a->vd);
2815         neon_store_reg64(rn1_64, a->vd + 1);
2816     }
2817     tcg_temp_free_i64(rn0_64);
2818     tcg_temp_free_i64(rn1_64);
2819     return true;
2820 }
2821
2822 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2823 {
2824     static NeonGenTwoOpWidenFn * const opfn[] = {
2825         NULL,
2826         gen_helper_neon_mull_s16,
2827         gen_mull_s32,
2828         NULL,
2829     };
2830
2831     return do_2scalar_long(s, a, opfn[a->size], NULL);
2832 }
2833
2834 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2835 {
2836     static NeonGenTwoOpWidenFn * const opfn[] = {
2837         NULL,
2838         gen_helper_neon_mull_u16,
2839         gen_mull_u32,
2840         NULL,
2841     };
2842
2843     return do_2scalar_long(s, a, opfn[a->size], NULL);
2844 }
2845
2846 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2847     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2848     {                                                                   \
2849         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2850             NULL,                                                       \
2851             gen_helper_neon_##MULL##16,                                 \
2852             gen_##MULL##32,                                             \
2853             NULL,                                                       \
2854         };                                                              \
2855         static NeonGenTwo64OpFn * const accfn[] = {                     \
2856             NULL,                                                       \
2857             gen_helper_neon_##ACC##l_u32,                               \
2858             tcg_gen_##ACC##_i64,                                        \
2859             NULL,                                                       \
2860         };                                                              \
2861         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2862     }
2863
2864 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2865 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2866 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2867 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2868
2869 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2870 {
2871     static NeonGenTwoOpWidenFn * const opfn[] = {
2872         NULL,
2873         gen_VQDMULL_16,
2874         gen_VQDMULL_32,
2875         NULL,
2876     };
2877
2878     return do_2scalar_long(s, a, opfn[a->size], NULL);
2879 }
2880
2881 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2882 {
2883     static NeonGenTwoOpWidenFn * const opfn[] = {
2884         NULL,
2885         gen_VQDMULL_16,
2886         gen_VQDMULL_32,
2887         NULL,
2888     };
2889     static NeonGenTwo64OpFn * const accfn[] = {
2890         NULL,
2891         gen_VQDMLAL_acc_16,
2892         gen_VQDMLAL_acc_32,
2893         NULL,
2894     };
2895
2896     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2897 }
2898
2899 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2900 {
2901     static NeonGenTwoOpWidenFn * const opfn[] = {
2902         NULL,
2903         gen_VQDMULL_16,
2904         gen_VQDMULL_32,
2905         NULL,
2906     };
2907     static NeonGenTwo64OpFn * const accfn[] = {
2908         NULL,
2909         gen_VQDMLSL_acc_16,
2910         gen_VQDMLSL_acc_32,
2911         NULL,
2912     };
2913
2914     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2915 }
2916
2917 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2918 {
2919     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2920         return false;
2921     }
2922
2923     /* UNDEF accesses to D16-D31 if they don't exist. */
2924     if (!dc_isar_feature(aa32_simd_r32, s) &&
2925         ((a->vd | a->vn | a->vm) & 0x10)) {
2926         return false;
2927     }
2928
2929     if ((a->vn | a->vm | a->vd) & a->q) {
2930         return false;
2931     }
2932
2933     if (a->imm > 7 && !a->q) {
2934         return false;
2935     }
2936
2937     if (!vfp_access_check(s)) {
2938         return true;
2939     }
2940
2941     if (!a->q) {
2942         /* Extract 64 bits from <Vm:Vn> */
2943         TCGv_i64 left, right, dest;
2944
2945         left = tcg_temp_new_i64();
2946         right = tcg_temp_new_i64();
2947         dest = tcg_temp_new_i64();
2948
2949         neon_load_reg64(right, a->vn);
2950         neon_load_reg64(left, a->vm);
2951         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2952         neon_store_reg64(dest, a->vd);
2953
2954         tcg_temp_free_i64(left);
2955         tcg_temp_free_i64(right);
2956         tcg_temp_free_i64(dest);
2957     } else {
2958         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2959         TCGv_i64 left, middle, right, destleft, destright;
2960
2961         left = tcg_temp_new_i64();
2962         middle = tcg_temp_new_i64();
2963         right = tcg_temp_new_i64();
2964         destleft = tcg_temp_new_i64();
2965         destright = tcg_temp_new_i64();
2966
2967         if (a->imm < 8) {
2968             neon_load_reg64(right, a->vn);
2969             neon_load_reg64(middle, a->vn + 1);
2970             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2971             neon_load_reg64(left, a->vm);
2972             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2973         } else {
2974             neon_load_reg64(right, a->vn + 1);
2975             neon_load_reg64(middle, a->vm);
2976             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2977             neon_load_reg64(left, a->vm + 1);
2978             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2979         }
2980
2981         neon_store_reg64(destright, a->vd);
2982         neon_store_reg64(destleft, a->vd + 1);
2983
2984         tcg_temp_free_i64(destright);
2985         tcg_temp_free_i64(destleft);
2986         tcg_temp_free_i64(right);
2987         tcg_temp_free_i64(middle);
2988         tcg_temp_free_i64(left);
2989     }
2990     return true;
2991 }
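
/*
 * Illustrative sketch (a hypothetical mirror of the a->q == 0 path
 * above, assuming 0 < imm < 8 since a host shift by 64 is undefined;
 * the extract2 op itself handles imm == 0 as a plain move): byte imm
 * of Vn becomes byte 0 of the result.
 */
static inline uint64_t example_vext_d(uint64_t vn, uint64_t vm, int imm)
{
    return (vn >> (imm * 8)) | (vm << (64 - imm * 8));
}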
2992
2993 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2994 {
2995     int n;
2996     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2997     TCGv_ptr ptr1;
2998
2999     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3000         return false;
3001     }
3002
3003     /* UNDEF accesses to D16-D31 if they don't exist. */
3004     if (!dc_isar_feature(aa32_simd_r32, s) &&
3005         ((a->vd | a->vn | a->vm) & 0x10)) {
3006         return false;
3007     }
3008
3009     if (!vfp_access_check(s)) {
3010         return true;
3011     }
3012
3013     n = a->len + 1;
3014     if ((a->vn + n) > 32) {
3015         /*
3016          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
3017          * helper function running off the end of the register file.
3018          */
3019         return false;
3020     }
3021     n <<= 3;
3022     if (a->op) {
3023         tmp = neon_load_reg(a->vd, 0);
3024     } else {
3025         tmp = tcg_temp_new_i32();
3026         tcg_gen_movi_i32(tmp, 0);
3027     }
3028     tmp2 = neon_load_reg(a->vm, 0);
3029     ptr1 = vfp_reg_ptr(true, a->vn);
3030     tmp4 = tcg_const_i32(n);
3031     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
3032     tcg_temp_free_i32(tmp);
3033     if (a->op) {
3034         tmp = neon_load_reg(a->vd, 1);
3035     } else {
3036         tmp = tcg_temp_new_i32();
3037         tcg_gen_movi_i32(tmp, 0);
3038     }
3039     tmp3 = neon_load_reg(a->vm, 1);
3040     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
3041     tcg_temp_free_i32(tmp4);
3042     tcg_temp_free_ptr(ptr1);
3043     neon_store_reg(a->vd, 0, tmp2);
3044     neon_store_reg(a->vd, 1, tmp3);
3045     tcg_temp_free_i32(tmp);
3046     return true;
3047 }
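
/*
 * Illustrative sketch (hypothetical per-byte semantics of the tbl
 * helper calls above): with a->len == 1 the table is n = 16 bytes
 * (two D registers); out-of-range indices give 0 for VTBL and leave
 * the destination byte unchanged for VTBX (a->op == 1).
 */
static inline uint8_t example_vtbl_byte(const uint8_t *table, int n,
                                        uint8_t index, uint8_t old_val,
                                        bool is_vtbx)
{
    if (index < n) {
        return table[index];
    }
    return is_vtbx ? old_val : 0;
}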
3048
3049 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
3050 {
3051     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3052         return false;
3053     }
3054
3055     /* UNDEF accesses to D16-D31 if they don't exist. */
3056     if (!dc_isar_feature(aa32_simd_r32, s) &&
3057         ((a->vd | a->vm) & 0x10)) {
3058         return false;
3059     }
3060
3061     if (a->vd & a->q) {
3062         return false;
3063     }
3064
3065     if (!vfp_access_check(s)) {
3066         return true;
3067     }
3068
3069     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
3070                          neon_element_offset(a->vm, a->index, a->size),
3071                          a->q ? 16 : 8, a->q ? 16 : 8);
3072     return true;
3073 }
3074
3075 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3076 {
3077     int pass, half;
3078
3079     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3080         return false;
3081     }
3082
3083     /* UNDEF accesses to D16-D31 if they don't exist. */
3084     if (!dc_isar_feature(aa32_simd_r32, s) &&
3085         ((a->vd | a->vm) & 0x10)) {
3086         return false;
3087     }
3088
3089     if ((a->vd | a->vm) & a->q) {
3090         return false;
3091     }
3092
3093     if (a->size == 3) {
3094         return false;
3095     }
3096
3097     if (!vfp_access_check(s)) {
3098         return true;
3099     }
3100
3101     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3102         TCGv_i32 tmp[2];
3103
3104         for (half = 0; half < 2; half++) {
3105             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
3106             switch (a->size) {
3107             case 0:
3108                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3109                 break;
3110             case 1:
3111                 gen_swap_half(tmp[half], tmp[half]);
3112                 break;
3113             case 2:
3114                 break;
3115             default:
3116                 g_assert_not_reached();
3117             }
3118         }
3119         neon_store_reg(a->vd, pass * 2, tmp[1]);
3120         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
3121     }
3122     return true;
3123 }
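
/*
 * Illustrative sketch (a hypothetical host-side mirror of the size == 0
 * path above, using the GCC/Clang builtin for brevity): the per-word
 * byte swaps plus the word swap done by the stores reverse all eight
 * bytes of the doubleword.
 */
static inline uint64_t example_vrev64_bytes(uint64_t x)
{
    uint32_t lo = __builtin_bswap32((uint32_t)x);
    uint32_t hi = __builtin_bswap32((uint32_t)(x >> 32));

    return ((uint64_t)lo << 32) | hi;
}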
3124
3125 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3126                               NeonGenWidenFn *widenfn,
3127                               NeonGenTwo64OpFn *opfn,
3128                               NeonGenTwo64OpFn *accfn)
3129 {
3130     /*
3131      * Pairwise long operations: widen both halves of the pair,
3132      * combine the pairs with the opfn, and then possibly accumulate
3133      * into the destination with the accfn.
3134      */
3135     int pass;
3136
3137     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3138         return false;
3139     }
3140
3141     /* UNDEF accesses to D16-D31 if they don't exist. */
3142     if (!dc_isar_feature(aa32_simd_r32, s) &&
3143         ((a->vd | a->vm) & 0x10)) {
3144         return false;
3145     }
3146
3147     if ((a->vd | a->vm) & a->q) {
3148         return false;
3149     }
3150
3151     if (!widenfn) {
3152         return false;
3153     }
3154
3155     if (!vfp_access_check(s)) {
3156         return true;
3157     }
3158
3159     for (pass = 0; pass < a->q + 1; pass++) {
3160         TCGv_i32 tmp;
3161         TCGv_i64 rm0_64, rm1_64, rd_64;
3162
3163         rm0_64 = tcg_temp_new_i64();
3164         rm1_64 = tcg_temp_new_i64();
3165         rd_64 = tcg_temp_new_i64();
3166         tmp = neon_load_reg(a->vm, pass * 2);
3167         widenfn(rm0_64, tmp);
3168         tcg_temp_free_i32(tmp);
3169         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3170         widenfn(rm1_64, tmp);
3171         tcg_temp_free_i32(tmp);
3172         opfn(rd_64, rm0_64, rm1_64);
3173         tcg_temp_free_i64(rm0_64);
3174         tcg_temp_free_i64(rm1_64);
3175
3176         if (accfn) {
3177             TCGv_i64 tmp64 = tcg_temp_new_i64();
3178             neon_load_reg64(tmp64, a->vd + pass);
3179             accfn(rd_64, tmp64, rd_64);
3180             tcg_temp_free_i64(tmp64);
3181         }
3182         neon_store_reg64(rd_64, a->vd + pass);
3183         tcg_temp_free_i64(rd_64);
3184     }
3185     return true;
3186 }
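
/*
 * Illustrative sketch (a hypothetical mirror of one 64-bit pass of the
 * signed-byte case below): each pair of adjacent elements is widened
 * and summed, halving the element count and doubling the width.
 */
static inline uint64_t example_vpaddl_s8(uint64_t x)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 4; i++) {
        int8_t lo = x >> (16 * i);          /* even element of the pair */
        int8_t hi = x >> (16 * i + 8);      /* odd element of the pair */
        r |= (uint64_t)(uint16_t)(lo + hi) << (16 * i);
    }
    return r;
}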
3187
static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
{
    static NeonGenWidenFn * const widenfn[] = {
        gen_helper_neon_widen_s8,
        gen_helper_neon_widen_s16,
        tcg_gen_ext_i32_i64,
        NULL,
    };
    static NeonGenTwo64OpFn * const opfn[] = {
        gen_helper_neon_paddl_u16,
        gen_helper_neon_paddl_u32,
        tcg_gen_add_i64,
        NULL,
    };

    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
}

static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
{
    static NeonGenWidenFn * const widenfn[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    static NeonGenTwo64OpFn * const opfn[] = {
        gen_helper_neon_paddl_u16,
        gen_helper_neon_paddl_u32,
        tcg_gen_add_i64,
        NULL,
    };

    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
}

static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
{
    static NeonGenWidenFn * const widenfn[] = {
        gen_helper_neon_widen_s8,
        gen_helper_neon_widen_s16,
        tcg_gen_ext_i32_i64,
        NULL,
    };
    static NeonGenTwo64OpFn * const opfn[] = {
        gen_helper_neon_paddl_u16,
        gen_helper_neon_paddl_u32,
        tcg_gen_add_i64,
        NULL,
    };
    static NeonGenTwo64OpFn * const accfn[] = {
        gen_helper_neon_addl_u16,
        gen_helper_neon_addl_u32,
        tcg_gen_add_i64,
        NULL,
    };

    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
                             accfn[a->size]);
}

static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
{
    static NeonGenWidenFn * const widenfn[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    static NeonGenTwo64OpFn * const opfn[] = {
        gen_helper_neon_paddl_u16,
        gen_helper_neon_paddl_u32,
        tcg_gen_add_i64,
        NULL,
    };
    static NeonGenTwo64OpFn * const accfn[] = {
        gen_helper_neon_addl_u16,
        gen_helper_neon_addl_u32,
        tcg_gen_add_i64,
        NULL,
    };

    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
                             accfn[a->size]);
}

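/*
 * VZIP interleaves and VUZP deinterleaves the elements of Vd and Vm.
 * Both registers are inputs and outputs, so the helpers work in place
 * on pointers into the register file.
 */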
typedef void ZipFn(TCGv_ptr, TCGv_ptr);

static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
                       ZipFn *fn)
{
    TCGv_ptr pd, pm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!fn) {
        /* Bad size or size/q combination */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    pd = vfp_reg_ptr(true, a->vd);
    pm = vfp_reg_ptr(true, a->vm);
    fn(pd, pm);
    tcg_temp_free_ptr(pd);
    tcg_temp_free_ptr(pm);
    return true;
}

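/*
 * The function tables below are indexed [q][size]; the NULL entries
 * (size 3, and the 32-bit element case for 64-bit vectors, which the
 * architecture makes UNDEFINED) cause do_zip_uzp() to return false.
 */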
static bool trans_VUZP(DisasContext *s, arg_2misc *a)
{
    static ZipFn * const fn[2][4] = {
        {
            gen_helper_neon_unzip8,
            gen_helper_neon_unzip16,
            NULL,
            NULL,
        }, {
            gen_helper_neon_qunzip8,
            gen_helper_neon_qunzip16,
            gen_helper_neon_qunzip32,
            NULL,
        }
    };
    return do_zip_uzp(s, a, fn[a->q][a->size]);
}

static bool trans_VZIP(DisasContext *s, arg_2misc *a)
{
    static ZipFn * const fn[2][4] = {
        {
            gen_helper_neon_zip8,
            gen_helper_neon_zip16,
            NULL,
            NULL,
        }, {
            gen_helper_neon_qzip8,
            gen_helper_neon_qzip16,
            gen_helper_neon_qzip32,
            NULL,
        }
    };
    return do_zip_uzp(s, a, fn[a->q][a->size]);
}

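/*
 * Narrowing moves (VMOVN, VQMOVUN, VQMOVN): the source is a Q register
 * (an even/odd D-register pair, hence the a->vm & 1 check) and the
 * result is a single D register with elements of half the input width.
 */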
static bool do_vmovn(DisasContext *s, arg_2misc *a,
                     NeonGenNarrowEnvFn *narrowfn)
{
    TCGv_i64 rm;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!narrowfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

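    /*
     * The narrowfn takes cpu_env because the saturating variants must
     * be able to set the QC flag; the plain VMOVN wrappers ignore it.
     */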
    rm = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    neon_load_reg64(rm, a->vm);
    narrowfn(rd0, cpu_env, rm);
    neon_load_reg64(rm, a->vm + 1);
    narrowfn(rd1, cpu_env, rm);
    neon_store_reg(a->vd, 0, rd0);
    neon_store_reg(a->vd, 1, rd1);
    tcg_temp_free_i64(rm);
    return true;
}

#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

DO_VMOVN(VMOVN, gen_neon_narrow_u)
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)

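/*
 * This is the 2-reg-misc encoding of VSHLL, which handles only the
 * "maximum shift" form: each element is widened and then shifted
 * left by its original width in bits, i.e. 8 << size.
 */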
static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
{
    TCGv_i32 rm0, rm1;
    TCGv_i64 rd;
    static NeonGenWidenFn * const widenfns[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    NeonGenWidenFn *widenfn = widenfns[a->size];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

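    /*
     * Load both halves of Vm before writing anything: the Q-register
     * destination is allowed to overlap the D-register source.
     */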
    rd = tcg_temp_new_i64();

    rm0 = neon_load_reg(a->vm, 0);
    rm1 = neon_load_reg(a->vm, 1);

    widenfn(rd, rm0);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd);
    widenfn(rd, rm1);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd + 1);

    tcg_temp_free_i64(rd);
    tcg_temp_free_i32(rm0);
    tcg_temp_free_i32(rm1);
    return true;
}

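/*
 * VCVT.F16.F32 narrows four single-precision elements from the Vm
 * Q register to half precision, packing two 16-bit results into each
 * 32-bit word of the Vd D register.
 */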
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

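    /*
     * All four source elements are loaded before the first store to
     * Vd, so the result is correct even if Vd overlaps Vm.
     */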
    fpst = get_fpstatus_ptr(true);
    ahp = get_ahp_flag();
    tmp = neon_load_reg(a->vm, 0);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = neon_load_reg(a->vm, 1);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    tcg_temp_free_i32(tmp);
    tmp = neon_load_reg(a->vm, 2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = neon_load_reg(a->vm, 3);
    neon_store_reg(a->vd, 0, tmp2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

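    /*
     * Both source words are loaded before the first store, so the
     * widening is safe even if the Vd Q register overlaps Vm. Note
     * that neon_store_reg() consumes and frees its input temp, which
     * is why tmp3 is reallocated for the second pair of elements.
     */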
    fpst = get_fpstatus_ptr(true);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 0, tmp3);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    neon_store_reg(a->vd, 1, tmp);
    tmp3 = tcg_temp_new_i32();
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 2, tmp3);
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    neon_store_reg(a->vd, 3, tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

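/*
 * Common code for 2-reg-misc operations which can be implemented as
 * a single gvec expansion across the whole vector.
 */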
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;