target/arm: Implement fp16 for Neon VFMA, VFMS
[qemu.git] target/arm/translate-neon.c.inc
/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This file is intended to be included from translate.c; it uses
 * some macros and definitions provided by that file.
 * It might be possible to convert it to a standalone .c file eventually.
 */

static inline int plus1(DisasContext *s, int x)
{
    return x + 1;
}

static inline int rsub_64(DisasContext *s, int x)
{
    return 64 - x;
}

static inline int rsub_32(DisasContext *s, int x)
{
    return 32 - x;
}

static inline int rsub_16(DisasContext *s, int x)
{
    return 16 - x;
}

static inline int rsub_8(DisasContext *s, int x)
{
    return 8 - x;
}
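
/*
 * plus1 and the rsub_* functions above are !function modifiers for the
 * generated decoder included below: plus1 recovers fields that encode
 * (value - 1), and rsub_<n> turns an encoded immediate into the shift
 * count n - imm used by the shift-by-immediate insns.
 */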

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
 * where 0 is the least significant end of the register.
 */
static inline long
neon_element_offset(int reg, int element, MemOp size)
{
    int element_size = 1 << size;
    int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
    /* Calculate the offset assuming fully little-endian,
     * then XOR to account for the order of the 8-byte units.
     */
    if (element_size < 8) {
        ofs ^= 8 - element_size;
    }
#endif
    return neon_reg_offset(reg, 0) + ofs;
}
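
/*
 * Worked example of the big-endian fixup in neon_element_offset():
 * with 16-bit elements (element_size == 2) the little-endian offsets
 * 0, 2, 4, 6 become 0^6, 2^6, 4^6, 6^6 = 6, 4, 2, 0, i.e. the
 * halfwords swap order within each 8-byte unit while the units
 * themselves stay in place.
 */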

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
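
/*
 * For both VCMLA above and VCADD below, a->rot is passed through as
 * the gvec 'data' immediate: it selects which multiple of 90 degrees
 * the helper rotates the second operand by, per the Arm ARM pseudocode
 * for these insns.
 */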

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
{
    int opr_sz;
    gen_helper_gvec_3 *fn_gvec;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       opr_sz, opr_sz, 0, fn_gvec);
    return true;
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    gen_helper_gvec_3_ptr *fn_gvec_ptr;
    int opr_sz;
    TCGv_ptr fpst;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
                   : gen_helper_gvec_fcmlah_idx);
    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == 0 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz,
                       (a->index << 2) | a->rot, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
{
    gen_helper_gvec_3 *fn_gvec;
    int opr_sz;

    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn) & 0x10)) {
        return false;
    }

    if ((a->vd | a->vn) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->rm),
                       opr_sz, opr_sz, a->index, fn_gvec);
    return true;
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

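/*
 * Properties of the VLD/VST "multiple structures" forms, indexed by
 * the itype field of the instruction: the register count, element
 * interleave factor and register spacing that trans_VLDST_multiple()
 * below loops over.
 */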
static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};

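/*
 * Writeback handling shared by the Neon load/store insns: rm == 15
 * means no writeback; rm == 13 means writeback of the immediate
 * stride; any other rm means writeback of that index register's value.
 */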
static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp endian = s->be_data;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    if (size == 0) {
        endian = MO_LE;
    }
    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        size = 3;
    }
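    /*
     * For example, a VLD1 of eight byte elements is performed here as
     * a single 64-bit little-endian load rather than eight separate
     * byte loads, since interleave == 1 promotes size to 3 above.
     */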
    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
                }
                tcg_gen_add_i32(addr, addr, tmp);
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = 2;
    }
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
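    /*
     * For example, VLD1 (all lanes) with T set replicates the loaded
     * element across two D registers (stride == 2, vec_size == 16).
     */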
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if ((a->size == 2) && ((a->align & 3) == 3)) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    /*
     * TODO: if we implemented alignment exceptions, we should check
     * addr against the alignment encoded in a->align here.
     */
    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
                            s->be_data | a->size);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rn_ofs = neon_reg_offset(a->vn, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }
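
/*
 * For instance, DO_3SAME(VADD, tcg_gen_gvec_add) below expands to a
 * trans_VADD_3s() that simply hands tcg_gen_gvec_add to do_3same().
 */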

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2);
    tcg_temp_free_i32(tmp2);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}
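
/*
 * Illustration of the unrolled pairwise op above:
 *   Vd[0] = fn(Vn[0], Vn[1]);
 *   Vd[1] = fn(Vm[0], Vm[1]);
 * i.e. adjacent source elements pair up into one result lane each.
 */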

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size != 0) {                                             \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }
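
/*
 * For these fp 3-same ops, size == 0 selects the single-precision
 * helper and nonzero size selects the fp16 helper, which additionally
 * requires the fp16 arithmetic feature (checked in the macro above).
 */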

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size != 0) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)

static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
                             uint32_t rn_ofs, uint32_t rm_ofs,
                             uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRECPS_fp_3s);
}

WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)

static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
                              uint32_t rn_ofs, uint32_t rm_ofs,
                              uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
}

static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        /* TODO fp16 support */
        return false;
    }

    return do_3same(s, a, gen_VRSQRTS_fp_3s);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
{
    /* FP operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    fpstatus = fpstatus_ptr(FPST_STD);
    tmp = neon_load_reg(a->vn, 0);
    tmp2 = neon_load_reg(a->vn, 1);
    fn(tmp, tmp, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);

    tmp3 = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    fn(tmp3, tmp3, tmp2, fpstatus);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_ptr(fpstatus);

    neon_store_reg(a->vd, 0, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which is an architecture extension we don't implement yet.
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size != 0) {                                         \
            /* TODO fp16 support */                                 \
            return false;                                           \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC);                        \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}
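
/*
 * For example, with 8-bit elements an out-of-range signed shift is
 * clamped to 7 above, and an arithmetic shift right by 7 leaves every
 * bit equal to the sign bit, as the architecture requires.
 */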

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        neon_load_reg64(tmp, a->vm + pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg64(tmp, a->vd + pass);
        tcg_temp_free_i64(tmp);
    }
    tcg_temp_free_i64(constimm);
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i32(dup_const(a->size, a->shift));

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
        fn(tmp, cpu_env, tmp, constimm);
        neon_store_reg(a->vd, pass, tmp);
    }
    tcg_temp_free_i32(constimm);
    return true;
}

#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }

DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)

static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
    constimm = tcg_const_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    neon_load_reg64(rm1, a->vm);
    neon_load_reg64(rm2, a->vm + 1);

    shiftfn(rm1, rm1, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm1);
    neon_store_reg(a->vd, 0, rd);

    shiftfn(rm2, rm2, constimm);
    rd = tcg_temp_new_i32();
    narrowfn(rd, cpu_env, rm2);
    neon_store_reg(a->vd, 1, rd);

    tcg_temp_free_i64(rm1);
    tcg_temp_free_i64(rm2);
    tcg_temp_free_i64(constimm);

    return true;
}

static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
    constimm = tcg_const_i32(imm);

    /* Load all inputs first to avoid potential overwrite */
    rm1 = neon_load_reg(a->vm, 0);
    rm2 = neon_load_reg(a->vm, 1);
    rm3 = neon_load_reg(a->vm + 1, 0);
    rm4 = neon_load_reg(a->vm + 1, 1);
    rtmp = tcg_temp_new_i64();

    shiftfn(rm1, rm1, constimm);
    shiftfn(rm2, rm2, constimm);

    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
    tcg_temp_free_i32(rm2);

    narrowfn(rm1, cpu_env, rtmp);
    neon_store_reg(a->vd, 0, rm1);

    shiftfn(rm3, rm3, constimm);
    shiftfn(rm4, rm4, constimm);
    tcg_temp_free_i32(constimm);

    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
    tcg_temp_free_i32(rm4);

    narrowfn(rm3, cpu_env, rtmp);
    tcg_temp_free_i64(rtmp);
    neon_store_reg(a->vd, 1, rm3);
    return true;
}

#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }

static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}

static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}

static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}

DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)

static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
                         NeonGenWidenFn *widenfn, bool u)
{
    TCGv_i64 tmp;
    TCGv_i32 rm0, rm1;
    uint64_t widen_mask = 0;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is a widen-and-shift operation. The shift is always less
     * than the width of the source type, so after widening the input
     * vector we can simply shift the whole 64-bit widened register,
     * and then clear the potential overflow bits resulting from left
1605      * bits of the narrow input appearing as right bits of the left
1606      * neighbour narrow input. Calculate a mask of bits to clear.
1607      */
1608     if ((a->shift != 0) && (a->size < 2 || u)) {
1609         int esize = 8 << a->size;
1610         widen_mask = MAKE_64BIT_MASK(0, esize);
1611         widen_mask >>= esize - a->shift;
1612         widen_mask = dup_const(a->size + 1, widen_mask);
1613     }
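    /*
     * Example (illustrative, not in the original): VSHLL.S16 with
     * shift == 4 gives esize == 16, so widen_mask is (0xffff >> 12)
     * duplicated across the 32-bit lanes, i.e. 0x0000000f0000000f;
     * clearing those bits removes the sign bits that the 64-bit left
     * shift moved in from the lane below.
     */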
1614
1615     rm0 = neon_load_reg(a->vm, 0);
1616     rm1 = neon_load_reg(a->vm, 1);
1617     tmp = tcg_temp_new_i64();
1618
1619     widenfn(tmp, rm0);
1620     tcg_temp_free_i32(rm0);
1621     if (a->shift != 0) {
1622         tcg_gen_shli_i64(tmp, tmp, a->shift);
1623         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1624     }
1625     neon_store_reg64(tmp, a->vd);
1626
1627     widenfn(tmp, rm1);
1628     tcg_temp_free_i32(rm1);
1629     if (a->shift != 0) {
1630         tcg_gen_shli_i64(tmp, tmp, a->shift);
1631         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1632     }
1633     neon_store_reg64(tmp, a->vd + 1);
1634     tcg_temp_free_i64(tmp);
1635     return true;
1636 }
1637
1638 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1639 {
1640     static NeonGenWidenFn * const widenfn[] = {
1641         gen_helper_neon_widen_s8,
1642         gen_helper_neon_widen_s16,
1643         tcg_gen_ext_i32_i64,
1644     };
1645     return do_vshll_2sh(s, a, widenfn[a->size], false);
1646 }
1647
1648 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1649 {
1650     static NeonGenWidenFn * const widenfn[] = {
1651         gen_helper_neon_widen_u8,
1652         gen_helper_neon_widen_u16,
1653         tcg_gen_extu_i32_i64,
1654     };
1655     return do_vshll_2sh(s, a, widenfn[a->size], true);
1656 }
1657
1658 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1659                       NeonGenTwoSingleOpFn *fn)
1660 {
1661     /* FP operations in 2-reg-and-shift group */
1662     TCGv_i32 tmp, shiftv;
1663     TCGv_ptr fpstatus;
1664     int pass;
1665
1666     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1667         return false;
1668     }
1669
1670     /* UNDEF accesses to D16-D31 if they don't exist. */
1671     if (!dc_isar_feature(aa32_simd_r32, s) &&
1672         ((a->vd | a->vm) & 0x10)) {
1673         return false;
1674     }
1675
1676     if ((a->vm | a->vd) & a->q) {
1677         return false;
1678     }
1679
1680     if (!vfp_access_check(s)) {
1681         return true;
1682     }
1683
1684     fpstatus = fpstatus_ptr(FPST_STD);
1685     shiftv = tcg_const_i32(a->shift);
1686     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1687         tmp = neon_load_reg(a->vm, pass);
1688         fn(tmp, tmp, shiftv, fpstatus);
1689         neon_store_reg(a->vd, pass, tmp);
1690     }
1691     tcg_temp_free_ptr(fpstatus);
1692     tcg_temp_free_i32(shiftv);
1693     return true;
1694 }
1695
1696 #define DO_FP_2SH(INSN, FUNC)                                           \
1697     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1698     {                                                                   \
1699         return do_fp_2sh(s, a, FUNC);                                   \
1700     }
1701
1702 DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
1703 DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
1704 DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
1705 DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
1706
1707 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1708 {
1709     /*
1710      * Expand the encoded constant.
1711      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1712      * We choose to not special-case this and will behave as if a
1713      * valid constant encoding of 0 had been given.
1714      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1715      */
1716     switch (cmode) {
1717     case 0: case 1:
1718         /* no-op */
1719         break;
1720     case 2: case 3:
1721         imm <<= 8;
1722         break;
1723     case 4: case 5:
1724         imm <<= 16;
1725         break;
1726     case 6: case 7:
1727         imm <<= 24;
1728         break;
1729     case 8: case 9:
1730         imm |= imm << 16;
1731         break;
1732     case 10: case 11:
1733         imm = (imm << 8) | (imm << 24);
1734         break;
1735     case 12:
1736         imm = (imm << 8) | 0xff;
1737         break;
1738     case 13:
1739         imm = (imm << 16) | 0xffff;
1740         break;
1741     case 14:
1742         if (op) {
1743             /*
1744              * This is the only case where the top and bottom 32 bits
1745              * of the encoded constant differ.
1746              */
1747             uint64_t imm64 = 0;
1748             int n;
1749
1750             for (n = 0; n < 8; n++) {
1751                 if (imm & (1 << n)) {
1752                     imm64 |= (0xffULL << (n * 8));
1753                 }
1754             }
1755             return imm64;
1756         }
1757         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1758         break;
1759     case 15:
1760         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1761             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1762         break;
1763     }
1764     if (op) {
1765         imm = ~imm;
1766     }
1767     return dup_const(MO_32, imm);
1768 }
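/*
 * Worked examples (illustrative, not in the original): imm = 0xab,
 * cmode = 12, op = 0 expands to (0xab << 8) | 0xff = 0xabff and is
 * duplicated to 0x0000abff0000abff; imm = 0x05, cmode = 14, op = 1
 * produces one 0xff byte per set bit, giving 0x0000000000ff00ff.
 */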
1769
1770 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1771                         GVecGen2iFn *fn)
1772 {
1773     uint64_t imm;
1774     int reg_ofs, vec_size;
1775
1776     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1777         return false;
1778     }
1779
1780     /* UNDEF accesses to D16-D31 if they don't exist. */
1781     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1782         return false;
1783     }
1784
1785     if (a->vd & a->q) {
1786         return false;
1787     }
1788
1789     if (!vfp_access_check(s)) {
1790         return true;
1791     }
1792
1793     reg_ofs = neon_reg_offset(a->vd, 0);
1794     vec_size = a->q ? 16 : 8;
1795     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1796
1797     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1798     return true;
1799 }
1800
1801 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1802                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1803 {
1804     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1805 }
1806
1807 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1808 {
1809     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1810     GVecGen2iFn *fn;
1811
1812     if ((a->cmode & 1) && a->cmode < 12) {
1813         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1814         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1815     } else {
1816         /* There is one unallocated cmode/op combination in this space */
1817         if (a->cmode == 15 && a->op == 1) {
1818             return false;
1819         }
1820         fn = gen_VMOV_1r;
1821     }
1822     return do_1reg_imm(s, a, fn);
1823 }
1824
1825 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1826                            NeonGenWidenFn *widenfn,
1827                            NeonGenTwo64OpFn *opfn,
1828                            bool src1_wide)
1829 {
1830     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1831     TCGv_i64 rn0_64, rn1_64, rm_64;
1832     TCGv_i32 rm;
1833
1834     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1835         return false;
1836     }
1837
1838     /* UNDEF accesses to D16-D31 if they don't exist. */
1839     if (!dc_isar_feature(aa32_simd_r32, s) &&
1840         ((a->vd | a->vn | a->vm) & 0x10)) {
1841         return false;
1842     }
1843
1844     if (!widenfn || !opfn) {
1845         /* size == 3 case, which is an entirely different insn group */
1846         return false;
1847     }
1848
1849     if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
1850         return false;
1851     }
1852
1853     if (!vfp_access_check(s)) {
1854         return true;
1855     }
1856
1857     rn0_64 = tcg_temp_new_i64();
1858     rn1_64 = tcg_temp_new_i64();
1859     rm_64 = tcg_temp_new_i64();
1860
1861     if (src1_wide) {
1862         neon_load_reg64(rn0_64, a->vn);
1863     } else {
1864         TCGv_i32 tmp = neon_load_reg(a->vn, 0);
1865         widenfn(rn0_64, tmp);
1866         tcg_temp_free_i32(tmp);
1867     }
1868     rm = neon_load_reg(a->vm, 0);
1869
1870     widenfn(rm_64, rm);
1871     tcg_temp_free_i32(rm);
1872     opfn(rn0_64, rn0_64, rm_64);
1873
1874     /*
1875      * Load second pass inputs before storing the first pass result, to
1876      * avoid incorrect results if a narrow input overlaps with the result.
1877      */
1878     if (src1_wide) {
1879         neon_load_reg64(rn1_64, a->vn + 1);
1880     } else {
1881         TCGv_i32 tmp = neon_load_reg(a->vn, 1);
1882         widenfn(rn1_64, tmp);
1883         tcg_temp_free_i32(tmp);
1884     }
1885     rm = neon_load_reg(a->vm, 1);
1886
1887     neon_store_reg64(rn0_64, a->vd);
1888
1889     widenfn(rm_64, rm);
1890     tcg_temp_free_i32(rm);
1891     opfn(rn1_64, rn1_64, rm_64);
1892     neon_store_reg64(rn1_64, a->vd + 1);
1893
1894     tcg_temp_free_i64(rn0_64);
1895     tcg_temp_free_i64(rn1_64);
1896     tcg_temp_free_i64(rm_64);
1897
1898     return true;
1899 }
1900
1901 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
1902     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1903     {                                                                   \
1904         static NeonGenWidenFn * const widenfn[] = {                     \
1905             gen_helper_neon_widen_##S##8,                               \
1906             gen_helper_neon_widen_##S##16,                              \
1907             tcg_gen_##EXT##_i32_i64,                                    \
1908             NULL,                                                       \
1909         };                                                              \
1910         static NeonGenTwo64OpFn * const addfn[] = {                     \
1911             gen_helper_neon_##OP##l_u16,                                \
1912             gen_helper_neon_##OP##l_u32,                                \
1913             tcg_gen_##OP##_i64,                                         \
1914             NULL,                                                       \
1915         };                                                              \
1916         return do_prewiden_3d(s, a, widenfn[a->size],                   \
1917                               addfn[a->size], SRC1WIDE);                \
1918     }
1919
1920 DO_PREWIDEN(VADDL_S, s, ext, add, false)
1921 DO_PREWIDEN(VADDL_U, u, extu, add, false)
1922 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
1923 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
1924 DO_PREWIDEN(VADDW_S, s, ext, add, true)
1925 DO_PREWIDEN(VADDW_U, u, extu, add, true)
1926 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
1927 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
1928
1929 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1930                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1931 {
1932     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1933     TCGv_i64 rn_64, rm_64;
1934     TCGv_i32 rd0, rd1;
1935
1936     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1937         return false;
1938     }
1939
1940     /* UNDEF accesses to D16-D31 if they don't exist. */
1941     if (!dc_isar_feature(aa32_simd_r32, s) &&
1942         ((a->vd | a->vn | a->vm) & 0x10)) {
1943         return false;
1944     }
1945
1946     if (!opfn || !narrowfn) {
1947         /* size == 3 case, which is an entirely different insn group */
1948         return false;
1949     }
1950
1951     if ((a->vn | a->vm) & 1) {
1952         return false;
1953     }
1954
1955     if (!vfp_access_check(s)) {
1956         return true;
1957     }
1958
1959     rn_64 = tcg_temp_new_i64();
1960     rm_64 = tcg_temp_new_i64();
1961     rd0 = tcg_temp_new_i32();
1962     rd1 = tcg_temp_new_i32();
1963
1964     neon_load_reg64(rn_64, a->vn);
1965     neon_load_reg64(rm_64, a->vm);
1966
1967     opfn(rn_64, rn_64, rm_64);
1968
1969     narrowfn(rd0, rn_64);
1970
1971     neon_load_reg64(rn_64, a->vn + 1);
1972     neon_load_reg64(rm_64, a->vm + 1);
1973
1974     opfn(rn_64, rn_64, rm_64);
1975
1976     narrowfn(rd1, rn_64);
1977
1978     neon_store_reg(a->vd, 0, rd0);
1979     neon_store_reg(a->vd, 1, rd1);
1980
1981     tcg_temp_free_i64(rn_64);
1982     tcg_temp_free_i64(rm_64);
1983
1984     return true;
1985 }
1986
1987 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1988     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1989     {                                                                   \
1990         static NeonGenTwo64OpFn * const addfn[] = {                     \
1991             gen_helper_neon_##OP##l_u16,                                \
1992             gen_helper_neon_##OP##l_u32,                                \
1993             tcg_gen_##OP##_i64,                                         \
1994             NULL,                                                       \
1995         };                                                              \
1996         static NeonGenNarrowFn * const narrowfn[] = {                   \
1997             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1998             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1999             EXTOP,                                                      \
2000             NULL,                                                       \
2001         };                                                              \
2002         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2003     }
2004
2005 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2006 {
2007     tcg_gen_addi_i64(rn, rn, 1u << 31);
2008     tcg_gen_extrh_i64_i32(rd, rn);
2009 }
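/*
 * Example (illustrative, not in the original): rn = 0x0000000080000000
 * rounds up: adding 1 << 31 yields 0x0000000100000000, so the extracted
 * high half is 1 rather than 0.
 */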
2010
2011 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2012 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2013 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2014 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2015
2016 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2017                        NeonGenTwoOpWidenFn *opfn,
2018                        NeonGenTwo64OpFn *accfn)
2019 {
2020     /*
2021      * 3-regs different lengths, long operations.
2022      * These perform an operation on two inputs that returns a double-width
2023      * result, and then possibly perform an accumulation operation of
2024      * that result into the double-width destination.
2025      */
2026     TCGv_i64 rd0, rd1, tmp;
2027     TCGv_i32 rn, rm;
2028
2029     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2030         return false;
2031     }
2032
2033     /* UNDEF accesses to D16-D31 if they don't exist. */
2034     if (!dc_isar_feature(aa32_simd_r32, s) &&
2035         ((a->vd | a->vn | a->vm) & 0x10)) {
2036         return false;
2037     }
2038
2039     if (!opfn) {
2040         /* size == 3 case, which is an entirely different insn group */
2041         return false;
2042     }
2043
2044     if (a->vd & 1) {
2045         return false;
2046     }
2047
2048     if (!vfp_access_check(s)) {
2049         return true;
2050     }
2051
2052     rd0 = tcg_temp_new_i64();
2053     rd1 = tcg_temp_new_i64();
2054
2055     rn = neon_load_reg(a->vn, 0);
2056     rm = neon_load_reg(a->vm, 0);
2057     opfn(rd0, rn, rm);
2058     tcg_temp_free_i32(rn);
2059     tcg_temp_free_i32(rm);
2060
2061     rn = neon_load_reg(a->vn, 1);
2062     rm = neon_load_reg(a->vm, 1);
2063     opfn(rd1, rn, rm);
2064     tcg_temp_free_i32(rn);
2065     tcg_temp_free_i32(rm);
2066
2067     /* Don't store results until after all loads: they might overlap */
2068     if (accfn) {
2069         tmp = tcg_temp_new_i64();
2070         neon_load_reg64(tmp, a->vd);
2071         accfn(tmp, tmp, rd0);
2072         neon_store_reg64(tmp, a->vd);
2073         neon_load_reg64(tmp, a->vd + 1);
2074         accfn(tmp, tmp, rd1);
2075         neon_store_reg64(tmp, a->vd + 1);
2076         tcg_temp_free_i64(tmp);
2077     } else {
2078         neon_store_reg64(rd0, a->vd);
2079         neon_store_reg64(rd1, a->vd + 1);
2080     }
2081
2082     tcg_temp_free_i64(rd0);
2083     tcg_temp_free_i64(rd1);
2084
2085     return true;
2086 }
2087
2088 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2089 {
2090     static NeonGenTwoOpWidenFn * const opfn[] = {
2091         gen_helper_neon_abdl_s16,
2092         gen_helper_neon_abdl_s32,
2093         gen_helper_neon_abdl_s64,
2094         NULL,
2095     };
2096
2097     return do_long_3d(s, a, opfn[a->size], NULL);
2098 }
2099
2100 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2101 {
2102     static NeonGenTwoOpWidenFn * const opfn[] = {
2103         gen_helper_neon_abdl_u16,
2104         gen_helper_neon_abdl_u32,
2105         gen_helper_neon_abdl_u64,
2106         NULL,
2107     };
2108
2109     return do_long_3d(s, a, opfn[a->size], NULL);
2110 }
2111
2112 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2113 {
2114     static NeonGenTwoOpWidenFn * const opfn[] = {
2115         gen_helper_neon_abdl_s16,
2116         gen_helper_neon_abdl_s32,
2117         gen_helper_neon_abdl_s64,
2118         NULL,
2119     };
2120     static NeonGenTwo64OpFn * const addfn[] = {
2121         gen_helper_neon_addl_u16,
2122         gen_helper_neon_addl_u32,
2123         tcg_gen_add_i64,
2124         NULL,
2125     };
2126
2127     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2128 }
2129
2130 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2131 {
2132     static NeonGenTwoOpWidenFn * const opfn[] = {
2133         gen_helper_neon_abdl_u16,
2134         gen_helper_neon_abdl_u32,
2135         gen_helper_neon_abdl_u64,
2136         NULL,
2137     };
2138     static NeonGenTwo64OpFn * const addfn[] = {
2139         gen_helper_neon_addl_u16,
2140         gen_helper_neon_addl_u32,
2141         tcg_gen_add_i64,
2142         NULL,
2143     };
2144
2145     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2146 }
2147
2148 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2149 {
2150     TCGv_i32 lo = tcg_temp_new_i32();
2151     TCGv_i32 hi = tcg_temp_new_i32();
2152
2153     tcg_gen_muls2_i32(lo, hi, rn, rm);
2154     tcg_gen_concat_i32_i64(rd, lo, hi);
2155
2156     tcg_temp_free_i32(lo);
2157     tcg_temp_free_i32(hi);
2158 }
2159
2160 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2161 {
2162     TCGv_i32 lo = tcg_temp_new_i32();
2163     TCGv_i32 hi = tcg_temp_new_i32();
2164
2165     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2166     tcg_gen_concat_i32_i64(rd, lo, hi);
2167
2168     tcg_temp_free_i32(lo);
2169     tcg_temp_free_i32(hi);
2170 }
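/*
 * Example (illustrative, not in the original): gen_mull_s32 with
 * inputs -1 and 2 gives lo = 0xfffffffe, hi = 0xffffffff, and the
 * concatenation is the sign-correct 64-bit product
 * 0xfffffffffffffffe (-2).
 */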
2171
2172 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2173 {
2174     static NeonGenTwoOpWidenFn * const opfn[] = {
2175         gen_helper_neon_mull_s8,
2176         gen_helper_neon_mull_s16,
2177         gen_mull_s32,
2178         NULL,
2179     };
2180
2181     return do_long_3d(s, a, opfn[a->size], NULL);
2182 }
2183
2184 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2185 {
2186     static NeonGenTwoOpWidenFn * const opfn[] = {
2187         gen_helper_neon_mull_u8,
2188         gen_helper_neon_mull_u16,
2189         gen_mull_u32,
2190         NULL,
2191     };
2192
2193     return do_long_3d(s, a, opfn[a->size], NULL);
2194 }
2195
2196 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2197     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2198     {                                                                   \
2199         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2200             gen_helper_neon_##MULL##8,                                  \
2201             gen_helper_neon_##MULL##16,                                 \
2202             gen_##MULL##32,                                             \
2203             NULL,                                                       \
2204         };                                                              \
2205         static NeonGenTwo64OpFn * const accfn[] = {                     \
2206             gen_helper_neon_##ACC##l_u16,                               \
2207             gen_helper_neon_##ACC##l_u32,                               \
2208             tcg_gen_##ACC##_i64,                                        \
2209             NULL,                                                       \
2210         };                                                              \
2211         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2212     }
2213
2214 DO_VMLAL(VMLAL_S,mull_s,add)
2215 DO_VMLAL(VMLAL_U,mull_u,add)
2216 DO_VMLAL(VMLSL_S,mull_s,sub)
2217 DO_VMLAL(VMLSL_U,mull_u,sub)
2218
2219 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2220 {
2221     gen_helper_neon_mull_s16(rd, rn, rm);
2222     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2223 }
2224
2225 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2226 {
2227     gen_mull_s32(rd, rn, rm);
2228     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2229 }
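/*
 * The doubling in VQDMULL comes from the saturating self-add: in the
 * 16-bit case, 0x4000 * 0x4000 widens to 0x10000000 and is doubled to
 * 0x20000000, while INT16_MIN * INT16_MIN saturates the doubled
 * result to INT32_MAX.
 */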
2230
2231 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2232 {
2233     static NeonGenTwoOpWidenFn * const opfn[] = {
2234         NULL,
2235         gen_VQDMULL_16,
2236         gen_VQDMULL_32,
2237         NULL,
2238     };
2239
2240     return do_long_3d(s, a, opfn[a->size], NULL);
2241 }
2242
2243 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2244 {
2245     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2246 }
2247
2248 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2249 {
2250     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2251 }
2252
2253 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2254 {
2255     static NeonGenTwoOpWidenFn * const opfn[] = {
2256         NULL,
2257         gen_VQDMULL_16,
2258         gen_VQDMULL_32,
2259         NULL,
2260     };
2261     static NeonGenTwo64OpFn * const accfn[] = {
2262         NULL,
2263         gen_VQDMLAL_acc_16,
2264         gen_VQDMLAL_acc_32,
2265         NULL,
2266     };
2267
2268     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2269 }
2270
2271 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2272 {
2273     gen_helper_neon_negl_u32(rm, rm);
2274     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2275 }
2276
2277 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2278 {
2279     tcg_gen_neg_i64(rm, rm);
2280     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2281 }
2282
2283 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2284 {
2285     static NeonGenTwoOpWidenFn * const opfn[] = {
2286         NULL,
2287         gen_VQDMULL_16,
2288         gen_VQDMULL_32,
2289         NULL,
2290     };
2291     static NeonGenTwo64OpFn * const accfn[] = {
2292         NULL,
2293         gen_VQDMLSL_acc_16,
2294         gen_VQDMLSL_acc_32,
2295         NULL,
2296     };
2297
2298     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2299 }
2300
2301 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2302 {
2303     gen_helper_gvec_3 *fn_gvec;
2304
2305     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2306         return false;
2307     }
2308
2309     /* UNDEF accesses to D16-D31 if they don't exist. */
2310     if (!dc_isar_feature(aa32_simd_r32, s) &&
2311         ((a->vd | a->vn | a->vm) & 0x10)) {
2312         return false;
2313     }
2314
2315     if (a->vd & 1) {
2316         return false;
2317     }
2318
2319     switch (a->size) {
2320     case 0:
2321         fn_gvec = gen_helper_neon_pmull_h;
2322         break;
2323     case 2:
2324         if (!dc_isar_feature(aa32_pmull, s)) {
2325             return false;
2326         }
2327         fn_gvec = gen_helper_gvec_pmull_q;
2328         break;
2329     default:
2330         return false;
2331     }
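    /*
     * Polynomial multiplication is carryless: for example
     * 0b11 * 0b11 = 0b101, because the partial products are combined
     * with XOR rather than ADD.
     */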
2332
2333     if (!vfp_access_check(s)) {
2334         return true;
2335     }
2336
2337     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2338                        neon_reg_offset(a->vn, 0),
2339                        neon_reg_offset(a->vm, 0),
2340                        16, 16, 0, fn_gvec);
2341     return true;
2342 }
2343
2344 static void gen_neon_dup_low16(TCGv_i32 var)
2345 {
2346     TCGv_i32 tmp = tcg_temp_new_i32();
2347     tcg_gen_ext16u_i32(var, var);
2348     tcg_gen_shli_i32(tmp, var, 16);
2349     tcg_gen_or_i32(var, var, tmp);
2350     tcg_temp_free_i32(tmp);
2351 }
2352
2353 static void gen_neon_dup_high16(TCGv_i32 var)
2354 {
2355     TCGv_i32 tmp = tcg_temp_new_i32();
2356     tcg_gen_andi_i32(var, var, 0xffff0000);
2357     tcg_gen_shri_i32(tmp, var, 16);
2358     tcg_gen_or_i32(var, var, tmp);
2359     tcg_temp_free_i32(tmp);
2360 }
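/*
 * Example (illustrative, not in the original): var = 0xaaaabbbb
 * becomes 0xbbbbbbbb after gen_neon_dup_low16 and 0xaaaaaaaa after
 * gen_neon_dup_high16.
 */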
2361
2362 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2363 {
2364     TCGv_i32 tmp;
2365     if (size == 1) {
2366         tmp = neon_load_reg(reg & 7, reg >> 4);
2367         if (reg & 8) {
2368             gen_neon_dup_high16(tmp);
2369         } else {
2370             gen_neon_dup_low16(tmp);
2371         }
2372     } else {
2373         tmp = neon_load_reg(reg & 15, reg >> 4);
2374     }
2375     return tmp;
2376 }
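/*
 * Example (illustrative, not in the original): for size == 1 and
 * reg == 0x16, reg & 7 selects D6, reg >> 4 picks its upper 32-bit
 * half and reg & 8 is clear, so 16-bit element 2 of D6 is returned,
 * duplicated into both lanes.
 */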
2377
2378 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2379                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2380 {
2381     /*
2382      * Two registers and a scalar: perform an operation between
2383      * the input elements and the scalar, and then possibly
2384      * perform an accumulation operation of that result into the
2385      * destination.
2386      */
2387     TCGv_i32 scalar;
2388     int pass;
2389
2390     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2391         return false;
2392     }
2393
2394     /* UNDEF accesses to D16-D31 if they don't exist. */
2395     if (!dc_isar_feature(aa32_simd_r32, s) &&
2396         ((a->vd | a->vn | a->vm) & 0x10)) {
2397         return false;
2398     }
2399
2400     if (!opfn) {
2401         /* Bad size (including size == 3, which is a different insn group) */
2402         return false;
2403     }
2404
2405     if (a->q && ((a->vd | a->vn) & 1)) {
2406         return false;
2407     }
2408
2409     if (!vfp_access_check(s)) {
2410         return true;
2411     }
2412
2413     scalar = neon_get_scalar(a->size, a->vm);
2414
2415     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2416         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2417         opfn(tmp, tmp, scalar);
2418         if (accfn) {
2419             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2420             accfn(tmp, rd, tmp);
2421             tcg_temp_free_i32(rd);
2422         }
2423         neon_store_reg(a->vd, pass, tmp);
2424     }
2425     tcg_temp_free_i32(scalar);
2426     return true;
2427 }
2428
2429 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2430 {
2431     static NeonGenTwoOpFn * const opfn[] = {
2432         NULL,
2433         gen_helper_neon_mul_u16,
2434         tcg_gen_mul_i32,
2435         NULL,
2436     };
2437
2438     return do_2scalar(s, a, opfn[a->size], NULL);
2439 }
2440
2441 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2442 {
2443     static NeonGenTwoOpFn * const opfn[] = {
2444         NULL,
2445         gen_helper_neon_mul_u16,
2446         tcg_gen_mul_i32,
2447         NULL,
2448     };
2449     static NeonGenTwoOpFn * const accfn[] = {
2450         NULL,
2451         gen_helper_neon_add_u16,
2452         tcg_gen_add_i32,
2453         NULL,
2454     };
2455
2456     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2457 }
2458
2459 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2460 {
2461     static NeonGenTwoOpFn * const opfn[] = {
2462         NULL,
2463         gen_helper_neon_mul_u16,
2464         tcg_gen_mul_i32,
2465         NULL,
2466     };
2467     static NeonGenTwoOpFn * const accfn[] = {
2468         NULL,
2469         gen_helper_neon_sub_u16,
2470         tcg_gen_sub_i32,
2471         NULL,
2472     };
2473
2474     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2475 }
2476
2477 /*
2478  * Rather than have a float-specific version of do_2scalar just for
2479  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2480  * a NeonGenTwoOpFn.
2481  */
2482 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2483     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2484     {                                                           \
2485         TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);             \
2486         FUNC(rd, rn, rm, fpstatus);                             \
2487         tcg_temp_free_ptr(fpstatus);                            \
2488     }
2489
2490 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2491 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2492 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
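/*
 * Each wrapper expands to a plain NeonGenTwoOpFn: gen_VMUL_F_mul, for
 * example, allocates the standard-FPSCR fpstatus, passes it to
 * gen_helper_vfp_muls and frees it again, hiding the extra argument
 * from do_2scalar.
 */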
2493
2494 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2495 {
2496     static NeonGenTwoOpFn * const opfn[] = {
2497         NULL,
2498         NULL, /* TODO: fp16 support */
2499         gen_VMUL_F_mul,
2500         NULL,
2501     };
2502
2503     return do_2scalar(s, a, opfn[a->size], NULL);
2504 }
2505
2506 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2507 {
2508     static NeonGenTwoOpFn * const opfn[] = {
2509         NULL,
2510         NULL, /* TODO: fp16 support */
2511         gen_VMUL_F_mul,
2512         NULL,
2513     };
2514     static NeonGenTwoOpFn * const accfn[] = {
2515         NULL,
2516         NULL, /* TODO: fp16 support */
2517         gen_VMUL_F_add,
2518         NULL,
2519     };
2520
2521     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2522 }
2523
2524 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2525 {
2526     static NeonGenTwoOpFn * const opfn[] = {
2527         NULL,
2528         NULL, /* TODO: fp16 support */
2529         gen_VMUL_F_mul,
2530         NULL,
2531     };
2532     static NeonGenTwoOpFn * const accfn[] = {
2533         NULL,
2534         NULL, /* TODO: fp16 support */
2535         gen_VMUL_F_sub,
2536         NULL,
2537     };
2538
2539     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2540 }
2541
2542 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2543 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2544 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2545 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2546
2547 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2548 {
2549     static NeonGenTwoOpFn * const opfn[] = {
2550         NULL,
2551         gen_VQDMULH_16,
2552         gen_VQDMULH_32,
2553         NULL,
2554     };
2555
2556     return do_2scalar(s, a, opfn[a->size], NULL);
2557 }
2558
2559 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2560 {
2561     static NeonGenTwoOpFn * const opfn[] = {
2562         NULL,
2563         gen_VQRDMULH_16,
2564         gen_VQRDMULH_32,
2565         NULL,
2566     };
2567
2568     return do_2scalar(s, a, opfn[a->size], NULL);
2569 }
2570
2571 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2572                             NeonGenThreeOpEnvFn *opfn)
2573 {
2574     /*
2575      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2576      * performs a kind of fused op-then-accumulate using a helper
2577      * function that takes all of rd, rn and the scalar at once.
2578      */
2579     TCGv_i32 scalar;
2580     int pass;
2581
2582     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2583         return false;
2584     }
2585
2586     if (!dc_isar_feature(aa32_rdm, s)) {
2587         return false;
2588     }
2589
2590     /* UNDEF accesses to D16-D31 if they don't exist. */
2591     if (!dc_isar_feature(aa32_simd_r32, s) &&
2592         ((a->vd | a->vn | a->vm) & 0x10)) {
2593         return false;
2594     }
2595
2596     if (!opfn) {
2597         /* Bad size (including size == 3, which is a different insn group) */
2598         return false;
2599     }
2600
2601     if (a->q && ((a->vd | a->vn) & 1)) {
2602         return false;
2603     }
2604
2605     if (!vfp_access_check(s)) {
2606         return true;
2607     }
2608
2609     scalar = neon_get_scalar(a->size, a->vm);
2610
2611     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2612         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2613         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2614         opfn(rd, cpu_env, rn, scalar, rd);
2615         tcg_temp_free_i32(rn);
2616         neon_store_reg(a->vd, pass, rd);
2617     }
2618     tcg_temp_free_i32(scalar);
2619
2620     return true;
2621 }
2622
2623 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2624 {
2625     static NeonGenThreeOpEnvFn *opfn[] = {
2626         NULL,
2627         gen_helper_neon_qrdmlah_s16,
2628         gen_helper_neon_qrdmlah_s32,
2629         NULL,
2630     };
2631     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2632 }
2633
2634 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2635 {
2636     static NeonGenThreeOpEnvFn *opfn[] = {
2637         NULL,
2638         gen_helper_neon_qrdmlsh_s16,
2639         gen_helper_neon_qrdmlsh_s32,
2640         NULL,
2641     };
2642     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2643 }
2644
2645 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2646                             NeonGenTwoOpWidenFn *opfn,
2647                             NeonGenTwo64OpFn *accfn)
2648 {
2649     /*
2650      * Two registers and a scalar, long operations: perform an
2651      * operation on the input elements and the scalar which produces
2652      * a double-width result, and then possibly perform an accumulation
2653      * operation of that result into the destination.
2654      */
2655     TCGv_i32 scalar, rn;
2656     TCGv_i64 rn0_64, rn1_64;
2657
2658     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2659         return false;
2660     }
2661
2662     /* UNDEF accesses to D16-D31 if they don't exist. */
2663     if (!dc_isar_feature(aa32_simd_r32, s) &&
2664         ((a->vd | a->vn | a->vm) & 0x10)) {
2665         return false;
2666     }
2667
2668     if (!opfn) {
2669         /* Bad size (including size == 3, which is a different insn group) */
2670         return false;
2671     }
2672
2673     if (a->vd & 1) {
2674         return false;
2675     }
2676
2677     if (!vfp_access_check(s)) {
2678         return true;
2679     }
2680
2681     scalar = neon_get_scalar(a->size, a->vm);
2682
2683     /* Load all inputs before writing any outputs, in case of overlap */
2684     rn = neon_load_reg(a->vn, 0);
2685     rn0_64 = tcg_temp_new_i64();
2686     opfn(rn0_64, rn, scalar);
2687     tcg_temp_free_i32(rn);
2688
2689     rn = neon_load_reg(a->vn, 1);
2690     rn1_64 = tcg_temp_new_i64();
2691     opfn(rn1_64, rn, scalar);
2692     tcg_temp_free_i32(rn);
2693     tcg_temp_free_i32(scalar);
2694
2695     if (accfn) {
2696         TCGv_i64 t64 = tcg_temp_new_i64();
2697         neon_load_reg64(t64, a->vd);
2698         accfn(t64, t64, rn0_64);
2699         neon_store_reg64(t64, a->vd);
2700         neon_load_reg64(t64, a->vd + 1);
2701         accfn(t64, t64, rn1_64);
2702         neon_store_reg64(t64, a->vd + 1);
2703         tcg_temp_free_i64(t64);
2704     } else {
2705         neon_store_reg64(rn0_64, a->vd);
2706         neon_store_reg64(rn1_64, a->vd + 1);
2707     }
2708     tcg_temp_free_i64(rn0_64);
2709     tcg_temp_free_i64(rn1_64);
2710     return true;
2711 }
2712
2713 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2714 {
2715     static NeonGenTwoOpWidenFn * const opfn[] = {
2716         NULL,
2717         gen_helper_neon_mull_s16,
2718         gen_mull_s32,
2719         NULL,
2720     };
2721
2722     return do_2scalar_long(s, a, opfn[a->size], NULL);
2723 }
2724
2725 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2726 {
2727     static NeonGenTwoOpWidenFn * const opfn[] = {
2728         NULL,
2729         gen_helper_neon_mull_u16,
2730         gen_mull_u32,
2731         NULL,
2732     };
2733
2734     return do_2scalar_long(s, a, opfn[a->size], NULL);
2735 }
2736
2737 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2738     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2739     {                                                                   \
2740         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2741             NULL,                                                       \
2742             gen_helper_neon_##MULL##16,                                 \
2743             gen_##MULL##32,                                             \
2744             NULL,                                                       \
2745         };                                                              \
2746         static NeonGenTwo64OpFn * const accfn[] = {                     \
2747             NULL,                                                       \
2748             gen_helper_neon_##ACC##l_u32,                               \
2749             tcg_gen_##ACC##_i64,                                        \
2750             NULL,                                                       \
2751         };                                                              \
2752         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2753     }
2754
2755 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2756 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2757 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2758 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2759
2760 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2761 {
2762     static NeonGenTwoOpWidenFn * const opfn[] = {
2763         NULL,
2764         gen_VQDMULL_16,
2765         gen_VQDMULL_32,
2766         NULL,
2767     };
2768
2769     return do_2scalar_long(s, a, opfn[a->size], NULL);
2770 }
2771
2772 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2773 {
2774     static NeonGenTwoOpWidenFn * const opfn[] = {
2775         NULL,
2776         gen_VQDMULL_16,
2777         gen_VQDMULL_32,
2778         NULL,
2779     };
2780     static NeonGenTwo64OpFn * const accfn[] = {
2781         NULL,
2782         gen_VQDMLAL_acc_16,
2783         gen_VQDMLAL_acc_32,
2784         NULL,
2785     };
2786
2787     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2788 }
2789
2790 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2791 {
2792     static NeonGenTwoOpWidenFn * const opfn[] = {
2793         NULL,
2794         gen_VQDMULL_16,
2795         gen_VQDMULL_32,
2796         NULL,
2797     };
2798     static NeonGenTwo64OpFn * const accfn[] = {
2799         NULL,
2800         gen_VQDMLSL_acc_16,
2801         gen_VQDMLSL_acc_32,
2802         NULL,
2803     };
2804
2805     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2806 }
2807
2808 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2809 {
2810     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2811         return false;
2812     }
2813
2814     /* UNDEF accesses to D16-D31 if they don't exist. */
2815     if (!dc_isar_feature(aa32_simd_r32, s) &&
2816         ((a->vd | a->vn | a->vm) & 0x10)) {
2817         return false;
2818     }
2819
2820     if ((a->vn | a->vm | a->vd) & a->q) {
2821         return false;
2822     }
2823
2824     if (a->imm > 7 && !a->q) {
2825         return false;
2826     }
2827
2828     if (!vfp_access_check(s)) {
2829         return true;
2830     }
2831
2832     if (!a->q) {
2833         /* Extract 64 bits from <Vm:Vn> */
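        /*
         * Example (illustrative, not in the original): imm == 3
         * shifts the 128-bit pair <Vm:Vn> right by 24 bits, so Vd
         * receives bytes 3..10 of the concatenation (with Vn
         * supplying the low-order bytes).
         */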
2834         TCGv_i64 left, right, dest;
2835
2836         left = tcg_temp_new_i64();
2837         right = tcg_temp_new_i64();
2838         dest = tcg_temp_new_i64();
2839
2840         neon_load_reg64(right, a->vn);
2841         neon_load_reg64(left, a->vm);
2842         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2843         neon_store_reg64(dest, a->vd);
2844
2845         tcg_temp_free_i64(left);
2846         tcg_temp_free_i64(right);
2847         tcg_temp_free_i64(dest);
2848     } else {
2849         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2850         TCGv_i64 left, middle, right, destleft, destright;
2851
2852         left = tcg_temp_new_i64();
2853         middle = tcg_temp_new_i64();
2854         right = tcg_temp_new_i64();
2855         destleft = tcg_temp_new_i64();
2856         destright = tcg_temp_new_i64();
2857
2858         if (a->imm < 8) {
2859             neon_load_reg64(right, a->vn);
2860             neon_load_reg64(middle, a->vn + 1);
2861             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2862             neon_load_reg64(left, a->vm);
2863             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2864         } else {
2865             neon_load_reg64(right, a->vn + 1);
2866             neon_load_reg64(middle, a->vm);
2867             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2868             neon_load_reg64(left, a->vm + 1);
2869             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2870         }
2871
2872         neon_store_reg64(destright, a->vd);
2873         neon_store_reg64(destleft, a->vd + 1);
2874
2875         tcg_temp_free_i64(destright);
2876         tcg_temp_free_i64(destleft);
2877         tcg_temp_free_i64(right);
2878         tcg_temp_free_i64(middle);
2879         tcg_temp_free_i64(left);
2880     }
2881     return true;
2882 }
2883
2884 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2885 {
2886     int n;
2887     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2888     TCGv_ptr ptr1;
2889
2890     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2891         return false;
2892     }
2893
2894     /* UNDEF accesses to D16-D31 if they don't exist. */
2895     if (!dc_isar_feature(aa32_simd_r32, s) &&
2896         ((a->vd | a->vn | a->vm) & 0x10)) {
2897         return false;
2898     }
2899
2900     if (!vfp_access_check(s)) {
2901         return true;
2902     }
2903
2904     n = a->len + 1;
2905     if ((a->vn + n) > 32) {
2906         /*
2907          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2908          * helper function running off the end of the register file.
2909          */
2910         return false;
2911     }
2912     n <<= 3;
2913     if (a->op) {
2914         tmp = neon_load_reg(a->vd, 0);
2915     } else {
2916         tmp = tcg_temp_new_i32();
2917         tcg_gen_movi_i32(tmp, 0);
2918     }
2919     tmp2 = neon_load_reg(a->vm, 0);
2920     ptr1 = vfp_reg_ptr(true, a->vn);
2921     tmp4 = tcg_const_i32(n);
2922     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
2923     tcg_temp_free_i32(tmp);
2924     if (a->op) {
2925         tmp = neon_load_reg(a->vd, 1);
2926     } else {
2927         tmp = tcg_temp_new_i32();
2928         tcg_gen_movi_i32(tmp, 0);
2929     }
2930     tmp3 = neon_load_reg(a->vm, 1);
2931     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
2932     tcg_temp_free_i32(tmp4);
2933     tcg_temp_free_ptr(ptr1);
2934     neon_store_reg(a->vd, 0, tmp2);
2935     neon_store_reg(a->vd, 1, tmp3);
2936     tcg_temp_free_i32(tmp);
2937     return true;
2938 }
2939
2940 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2941 {
2942     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2943         return false;
2944     }
2945
2946     /* UNDEF accesses to D16-D31 if they don't exist. */
2947     if (!dc_isar_feature(aa32_simd_r32, s) &&
2948         ((a->vd | a->vm) & 0x10)) {
2949         return false;
2950     }
2951
2952     if (a->vd & a->q) {
2953         return false;
2954     }
2955
2956     if (!vfp_access_check(s)) {
2957         return true;
2958     }
2959
2960     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
2961                          neon_element_offset(a->vm, a->index, a->size),
2962                          a->q ? 16 : 8, a->q ? 16 : 8);
2963     return true;
2964 }
2965
2966 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2967 {
2968     int pass, half;
2969
2970     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2971         return false;
2972     }
2973
2974     /* UNDEF accesses to D16-D31 if they don't exist. */
2975     if (!dc_isar_feature(aa32_simd_r32, s) &&
2976         ((a->vd | a->vm) & 0x10)) {
2977         return false;
2978     }
2979
2980     if ((a->vd | a->vm) & a->q) {
2981         return false;
2982     }
2983
2984     if (a->size == 3) {
2985         return false;
2986     }
2987
2988     if (!vfp_access_check(s)) {
2989         return true;
2990     }
2991
2992     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2993         TCGv_i32 tmp[2];
2994
2995         for (half = 0; half < 2; half++) {
2996             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
2997             switch (a->size) {
2998             case 0:
2999                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3000                 break;
3001             case 1:
3002                 gen_swap_half(tmp[half], tmp[half]);
3003                 break;
3004             case 2:
3005                 break;
3006             default:
3007                 g_assert_not_reached();
3008             }
3009         }
3010         neon_store_reg(a->vd, pass * 2, tmp[1]);
3011         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
3012     }
3013     return true;
3014 }
3015
3016 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3017                               NeonGenWidenFn *widenfn,
3018                               NeonGenTwo64OpFn *opfn,
3019                               NeonGenTwo64OpFn *accfn)
3020 {
3021     /*
3022      * Pairwise long operations: widen both halves of the pair,
3023      * combine the pairs with the opfn, and then possibly accumulate
3024      * into the destination with the accfn.
3025      */
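    /*
     * Example: VPADDL.S8 Dd, Dm adds each adjacent pair of s8
     * elements of Dm, producing four s16 sums; the VPADAL forms
     * additionally accumulate those sums into the elements of Dd.
     */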
3026     int pass;
3027
3028     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3029         return false;
3030     }
3031
3032     /* UNDEF accesses to D16-D31 if they don't exist. */
3033     if (!dc_isar_feature(aa32_simd_r32, s) &&
3034         ((a->vd | a->vm) & 0x10)) {
3035         return false;
3036     }
3037
3038     if ((a->vd | a->vm) & a->q) {
3039         return false;
3040     }
3041
3042     if (!widenfn) {
3043         return false;
3044     }
3045
3046     if (!vfp_access_check(s)) {
3047         return true;
3048     }
3049
3050     for (pass = 0; pass < a->q + 1; pass++) {
3051         TCGv_i32 tmp;
3052         TCGv_i64 rm0_64, rm1_64, rd_64;
3053
3054         rm0_64 = tcg_temp_new_i64();
3055         rm1_64 = tcg_temp_new_i64();
3056         rd_64 = tcg_temp_new_i64();
3057         tmp = neon_load_reg(a->vm, pass * 2);
3058         widenfn(rm0_64, tmp);
3059         tcg_temp_free_i32(tmp);
3060         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3061         widenfn(rm1_64, tmp);
3062         tcg_temp_free_i32(tmp);
3063         opfn(rd_64, rm0_64, rm1_64);
3064         tcg_temp_free_i64(rm0_64);
3065         tcg_temp_free_i64(rm1_64);
3066
3067         if (accfn) {
3068             TCGv_i64 tmp64 = tcg_temp_new_i64();
3069             neon_load_reg64(tmp64, a->vd + pass);
3070             accfn(rd_64, tmp64, rd_64);
3071             tcg_temp_free_i64(tmp64);
3072         }
3073         neon_store_reg64(rd_64, a->vd + pass);
3074         tcg_temp_free_i64(rd_64);
3075     }
3076     return true;
3077 }
3078
3079 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3080 {
3081     static NeonGenWidenFn * const widenfn[] = {
3082         gen_helper_neon_widen_s8,
3083         gen_helper_neon_widen_s16,
3084         tcg_gen_ext_i32_i64,
3085         NULL,
3086     };
3087     static NeonGenTwo64OpFn * const opfn[] = {
3088         gen_helper_neon_paddl_u16,
3089         gen_helper_neon_paddl_u32,
3090         tcg_gen_add_i64,
3091         NULL,
3092     };
3093
3094     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3095 }
3096
3097 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3098 {
3099     static NeonGenWidenFn * const widenfn[] = {
3100         gen_helper_neon_widen_u8,
3101         gen_helper_neon_widen_u16,
3102         tcg_gen_extu_i32_i64,
3103         NULL,
3104     };
3105     static NeonGenTwo64OpFn * const opfn[] = {
3106         gen_helper_neon_paddl_u16,
3107         gen_helper_neon_paddl_u32,
3108         tcg_gen_add_i64,
3109         NULL,
3110     };
3111
3112     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3113 }
3114
3115 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3116 {
3117     static NeonGenWidenFn * const widenfn[] = {
3118         gen_helper_neon_widen_s8,
3119         gen_helper_neon_widen_s16,
3120         tcg_gen_ext_i32_i64,
3121         NULL,
3122     };
3123     static NeonGenTwo64OpFn * const opfn[] = {
3124         gen_helper_neon_paddl_u16,
3125         gen_helper_neon_paddl_u32,
3126         tcg_gen_add_i64,
3127         NULL,
3128     };
3129     static NeonGenTwo64OpFn * const accfn[] = {
3130         gen_helper_neon_addl_u16,
3131         gen_helper_neon_addl_u32,
3132         tcg_gen_add_i64,
3133         NULL,
3134     };
3135
3136     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3137                              accfn[a->size]);
3138 }
3139
3140 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3141 {
3142     static NeonGenWidenFn * const widenfn[] = {
3143         gen_helper_neon_widen_u8,
3144         gen_helper_neon_widen_u16,
3145         tcg_gen_extu_i32_i64,
3146         NULL,
3147     };
3148     static NeonGenTwo64OpFn * const opfn[] = {
3149         gen_helper_neon_paddl_u16,
3150         gen_helper_neon_paddl_u32,
3151         tcg_gen_add_i64,
3152         NULL,
3153     };
3154     static NeonGenTwo64OpFn * const accfn[] = {
3155         gen_helper_neon_addl_u16,
3156         gen_helper_neon_addl_u32,
3157         tcg_gen_add_i64,
3158         NULL,
3159     };
3160
3161     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3162                              accfn[a->size]);
3163 }
3164
3165 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3166
3167 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3168                        ZipFn *fn)
3169 {
3170     TCGv_ptr pd, pm;
3171
3172     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3173         return false;
3174     }
3175
3176     /* UNDEF accesses to D16-D31 if they don't exist. */
3177     if (!dc_isar_feature(aa32_simd_r32, s) &&
3178         ((a->vd | a->vm) & 0x10)) {
3179         return false;
3180     }
3181
3182     if ((a->vd | a->vm) & a->q) {
3183         return false;
3184     }
3185
3186     if (!fn) {
3187         /* Bad size or size/q combination */
3188         return false;
3189     }
3190
3191     if (!vfp_access_check(s)) {
3192         return true;
3193     }
3194
3195     pd = vfp_reg_ptr(true, a->vd);
3196     pm = vfp_reg_ptr(true, a->vm);
3197     fn(pd, pm);
3198     tcg_temp_free_ptr(pd);
3199     tcg_temp_free_ptr(pm);
3200     return true;
3201 }
3202
3203 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3204 {
3205     static ZipFn * const fn[2][4] = {
3206         {
3207             gen_helper_neon_unzip8,
3208             gen_helper_neon_unzip16,
3209             NULL,
3210             NULL,
3211         }, {
3212             gen_helper_neon_qunzip8,
3213             gen_helper_neon_qunzip16,
3214             gen_helper_neon_qunzip32,
3215             NULL,
3216         }
3217     };
3218     return do_zip_uzp(s, a, fn[a->q][a->size]);
3219 }
3220
3221 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3222 {
3223     static ZipFn * const fn[2][4] = {
3224         {
3225             gen_helper_neon_zip8,
3226             gen_helper_neon_zip16,
3227             NULL,
3228             NULL,
3229         }, {
3230             gen_helper_neon_qzip8,
3231             gen_helper_neon_qzip16,
3232             gen_helper_neon_qzip32,
3233             NULL,
3234         }
3235     };
3236     return do_zip_uzp(s, a, fn[a->q][a->size]);
3237 }
3238
static bool do_vmovn(DisasContext *s, arg_2misc *a,
                     NeonGenNarrowEnvFn *narrowfn)
{
    TCGv_i64 rm;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vm & 1) {
        return false;
    }

    if (!narrowfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    neon_load_reg64(rm, a->vm);
    narrowfn(rd0, cpu_env, rm);
    neon_load_reg64(rm, a->vm + 1);
    narrowfn(rd1, cpu_env, rm);
    neon_store_reg(a->vd, 0, rd0);
    neon_store_reg(a->vd, 1, rd1);
    tcg_temp_free_i64(rm);
    return true;
}

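/*
 * The token-pasted FUNC##8/16/32 entries pick the narrowing helper
 * for each element size; the NULL entry makes do_vmovn() reject the
 * invalid size == 3 encoding.
 */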
#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

DO_VMOVN(VMOVN, gen_neon_narrow_u)
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)

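/*
 * VSHLL widens each element of Dm and then shifts it left by the
 * element width (8 << size), producing a Q register result; vd must
 * therefore be even.
 */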
static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
{
    TCGv_i32 rm0, rm1;
    TCGv_i64 rd;
    static NeonGenWidenFn * const widenfns[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    NeonGenWidenFn *widenfn = widenfns[a->size];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd = tcg_temp_new_i64();

    rm0 = neon_load_reg(a->vm, 0);
    rm1 = neon_load_reg(a->vm, 1);

    widenfn(rd, rm0);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd);
    widenfn(rd, rm1);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    neon_store_reg64(rd, a->vd + 1);

    tcg_temp_free_i64(rd);
    tcg_temp_free_i32(rm0);
    tcg_temp_free_i32(rm1);
    return true;
}

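/*
 * VCVT.F16.F32: convert the four single-precision elements of Qm to
 * half precision and pack them into Dd, low halfword first.
 */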
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = neon_load_reg(a->vm, 0);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = neon_load_reg(a->vm, 1);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    tcg_temp_free_i32(tmp);
    tmp = neon_load_reg(a->vm, 2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = neon_load_reg(a->vm, 3);
    neon_store_reg(a->vd, 0, tmp2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    neon_store_reg(a->vd, 1, tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

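/*
 * VCVT.F32.F16: the widening direction, unpacking four half-precision
 * values from Dm into four single-precision elements of Qd.
 */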
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp = neon_load_reg(a->vm, 0);
    tmp2 = neon_load_reg(a->vm, 1);
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 0, tmp3);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    neon_store_reg(a->vd, 1, tmp);
    tmp3 = tcg_temp_new_i32();
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    neon_store_reg(a->vd, 2, tmp3);
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    neon_store_reg(a->vd, 3, tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}

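/*
 * Shared decode checks for 2-reg-misc operations with a gvec
 * expander, which operate on the whole 64- or 128-bit vector at
 * once instead of iterating 32-bit passes.
 */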
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_reg_offset(a->vd, 0);
    int rm_ofs = neon_reg_offset(a->vm, 0);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}

#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)

static bool trans_VMVN(DisasContext *s, arg_2misc *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_2misc_vec(s, a, tcg_gen_gvec_not);
}

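/*
 * The crypto operations are out-of-line gvec helpers; these wrappers
 * adapt them to the GVecGen2Fn signature used by do_2misc_vec(). The
 * three-operand form passes rd_ofs twice because the AES helpers also
 * read the destination, and DATA selects the variant (e.g. AESE vs
 * AESD sharing gen_helper_crypto_aese).
 */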
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)

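/*
 * The crypto 2-reg-misc instructions additionally gate on the
 * relevant ISAR feature bit and on the fixed size field that each
 * encoding requires.
 */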
#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }

DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)

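/*
 * Fallback for 2-reg-misc operations with no gvec expander: invoke
 * fn once for each 32-bit pass of the registers.
 */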
static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
    int pass;

    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }
