xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision 5e29203bc7428d77f940d6427557a3b258e2224c)
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
19a3ef070eSClaudio Fontana 
20a3ef070eSClaudio Fontana #include "qemu/osdep.h"
21a3ef070eSClaudio Fontana #include "cpu.h"
22a3ef070eSClaudio Fontana #include "exec/helper-proto.h"
23a3ef070eSClaudio Fontana #include "tcg/tcg-gvec-desc.h"
24a3ef070eSClaudio Fontana #include "fpu/softfloat.h"
25a3ef070eSClaudio Fontana #include "qemu/int128.h"
268e3da4c7SRichard Henderson #include "crypto/clmul.h"
27a3ef070eSClaudio Fontana #include "vec_internal.h"
28a3ef070eSClaudio Fontana 
/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 * Entry i has byte j equal to 0xff exactly when bit j of i is set,
 * i.e. the table holds the values printed by:
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 *
 * The entries are produced at compile time by the helper macros below,
 * which double the number of consecutive indices at each level.
 */
#define EXPAND_PRED_B_LANE(i, j) \
    ((((i) >> (j)) & 1) ? ((uint64_t)0xff << ((j) * 8)) : (uint64_t)0)
#define EXPAND_PRED_B_1(i) \
    (EXPAND_PRED_B_LANE(i, 0) | EXPAND_PRED_B_LANE(i, 1) | \
     EXPAND_PRED_B_LANE(i, 2) | EXPAND_PRED_B_LANE(i, 3) | \
     EXPAND_PRED_B_LANE(i, 4) | EXPAND_PRED_B_LANE(i, 5) | \
     EXPAND_PRED_B_LANE(i, 6) | EXPAND_PRED_B_LANE(i, 7))
#define EXPAND_PRED_B_2(i)   EXPAND_PRED_B_1(i),  EXPAND_PRED_B_1((i) + 1)
#define EXPAND_PRED_B_4(i)   EXPAND_PRED_B_2(i),  EXPAND_PRED_B_2((i) + 2)
#define EXPAND_PRED_B_8(i)   EXPAND_PRED_B_4(i),  EXPAND_PRED_B_4((i) + 4)
#define EXPAND_PRED_B_16(i)  EXPAND_PRED_B_8(i),  EXPAND_PRED_B_8((i) + 8)
#define EXPAND_PRED_B_32(i)  EXPAND_PRED_B_16(i), EXPAND_PRED_B_16((i) + 16)
#define EXPAND_PRED_B_64(i)  EXPAND_PRED_B_32(i), EXPAND_PRED_B_32((i) + 32)
#define EXPAND_PRED_B_128(i) EXPAND_PRED_B_64(i), EXPAND_PRED_B_64((i) + 64)

const uint64_t expand_pred_b_data[256] = {
    EXPAND_PRED_B_128(0),
    EXPAND_PRED_B_128(128),
};

#undef EXPAND_PRED_B_128
#undef EXPAND_PRED_B_64
#undef EXPAND_PRED_B_32
#undef EXPAND_PRED_B_16
#undef EXPAND_PRED_B_8
#undef EXPAND_PRED_B_4
#undef EXPAND_PRED_B_2
#undef EXPAND_PRED_B_1
#undef EXPAND_PRED_B_LANE
130a3ef070eSClaudio Fontana 
/*
 * Similarly for half-word elements: only the even bits of the index are
 * meaningful, and bit 2k selects the 16-bit lane k of the result.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 *
 * Indices with any odd bit set, and index 0, are left to the implicit
 * zero initialisation.
 */
#define EXPAND_PRED_H_LANE(i, k) \
    ((((i) >> (2 * (k))) & 1) ? ((uint64_t)0xffff << (16 * (k))) : (uint64_t)0)
#define EXPAND_PRED_H(i) \
    [i] = (EXPAND_PRED_H_LANE(i, 0) | EXPAND_PRED_H_LANE(i, 1) | \
           EXPAND_PRED_H_LANE(i, 2) | EXPAND_PRED_H_LANE(i, 3))

const uint64_t expand_pred_h_data[0x55 + 1] = {
    EXPAND_PRED_H(0x01),
    EXPAND_PRED_H(0x04),
    EXPAND_PRED_H(0x05),
    EXPAND_PRED_H(0x10),
    EXPAND_PRED_H(0x11),
    EXPAND_PRED_H(0x14),
    EXPAND_PRED_H(0x15),
    EXPAND_PRED_H(0x40),
    EXPAND_PRED_H(0x41),
    EXPAND_PRED_H(0x44),
    EXPAND_PRED_H(0x45),
    EXPAND_PRED_H(0x50),
    EXPAND_PRED_H(0x51),
    EXPAND_PRED_H(0x54),
    EXPAND_PRED_H(0x55),
};

#undef EXPAND_PRED_H
#undef EXPAND_PRED_H_LANE
156a3ef070eSClaudio Fontana 
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 8-bit.
 *
 * @src1, @src2: multiplicands
 * @src3:        accumulator
 * @neg:         negate the product before accumulating
 * @round:       add the rounding constant before the final shift
 *
 * Returns the result clamped to [INT8_MIN, INT8_MAX].
 */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t prod = (int32_t)src1 * src2;
    int32_t addend = ((int32_t)src3 << 7) + (round << 6);
    int32_t sum = ((neg ? -prod : prod) + addend) >> 7;

    /* Clamp anything that no longer fits in 8 bits. */
    if (sum < INT8_MIN) {
        return INT8_MIN;
    }
    if (sum > INT8_MAX) {
        return INT8_MAX;
    }
    return sum;
}
178a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmlah_b)179a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
181a3ef070eSClaudio Fontana {
182a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
183a3ef070eSClaudio Fontana     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184a3ef070eSClaudio Fontana 
185a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
186a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187a3ef070eSClaudio Fontana     }
188a3ef070eSClaudio Fontana }
189a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmlsh_b)190a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
192a3ef070eSClaudio Fontana {
193a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
194a3ef070eSClaudio Fontana     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195a3ef070eSClaudio Fontana 
196a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
197a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198a3ef070eSClaudio Fontana     }
199a3ef070eSClaudio Fontana }
200a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_b)201a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202a3ef070eSClaudio Fontana {
203a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
204a3ef070eSClaudio Fontana     int8_t *d = vd, *n = vn, *m = vm;
205a3ef070eSClaudio Fontana 
206a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
207a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208a3ef070eSClaudio Fontana     }
209a3ef070eSClaudio Fontana }
210a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_b)211a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212a3ef070eSClaudio Fontana {
213a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
214a3ef070eSClaudio Fontana     int8_t *d = vd, *n = vn, *m = vm;
215a3ef070eSClaudio Fontana 
216a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
217a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218a3ef070eSClaudio Fontana     }
219a3ef070eSClaudio Fontana }
220a3ef070eSClaudio Fontana 
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 16-bit.
 *
 * @src1, @src2: multiplicands
 * @src3:        accumulator
 * @neg:         negate the product before accumulating
 * @round:       add the rounding constant before the final shift
 * @sat:         set to 1 when the result had to be clamped; not written
 *               otherwise
 *
 * Returns the result clamped to [INT16_MIN, INT16_MAX].
 */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Same simplification as do_sqrdmlah_b, with a shift count of 15. */
    int32_t prod = (int32_t)src1 * src2;
    int32_t addend = ((int32_t)src3 << 15) + (round << 14);
    int32_t sum = ((neg ? -prod : prod) + addend) >> 15;

    if (sum < INT16_MIN) {
        *sat = 1;
        return INT16_MIN;
    }
    if (sum > INT16_MAX) {
        *sat = 1;
        return INT16_MAX;
    }
    return sum;
}
239a3ef070eSClaudio Fontana 
HELPER(neon_qrdmlah_s16)240a3ef070eSClaudio Fontana uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241a3ef070eSClaudio Fontana                                   uint32_t src2, uint32_t src3)
242a3ef070eSClaudio Fontana {
243a3ef070eSClaudio Fontana     uint32_t *sat = &env->vfp.qc[0];
244a3ef070eSClaudio Fontana     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245a3ef070eSClaudio Fontana     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246a3ef070eSClaudio Fontana                                 false, true, sat);
247a3ef070eSClaudio Fontana     return deposit32(e1, 16, 16, e2);
248a3ef070eSClaudio Fontana }
249a3ef070eSClaudio Fontana 
HELPER(gvec_qrdmlah_s16)250a3ef070eSClaudio Fontana void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251a3ef070eSClaudio Fontana                               void *vq, uint32_t desc)
252a3ef070eSClaudio Fontana {
253a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
254a3ef070eSClaudio Fontana     int16_t *d = vd;
255a3ef070eSClaudio Fontana     int16_t *n = vn;
256a3ef070eSClaudio Fontana     int16_t *m = vm;
257a3ef070eSClaudio Fontana     uintptr_t i;
258a3ef070eSClaudio Fontana 
259a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
260a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261a3ef070eSClaudio Fontana     }
262a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
263a3ef070eSClaudio Fontana }
264a3ef070eSClaudio Fontana 
HELPER(neon_qrdmlsh_s16)265a3ef070eSClaudio Fontana uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266a3ef070eSClaudio Fontana                                   uint32_t src2, uint32_t src3)
267a3ef070eSClaudio Fontana {
268a3ef070eSClaudio Fontana     uint32_t *sat = &env->vfp.qc[0];
269a3ef070eSClaudio Fontana     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270a3ef070eSClaudio Fontana     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271a3ef070eSClaudio Fontana                                 true, true, sat);
272a3ef070eSClaudio Fontana     return deposit32(e1, 16, 16, e2);
273a3ef070eSClaudio Fontana }
274a3ef070eSClaudio Fontana 
HELPER(gvec_qrdmlsh_s16)275a3ef070eSClaudio Fontana void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276a3ef070eSClaudio Fontana                               void *vq, uint32_t desc)
277a3ef070eSClaudio Fontana {
278a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
279a3ef070eSClaudio Fontana     int16_t *d = vd;
280a3ef070eSClaudio Fontana     int16_t *n = vn;
281a3ef070eSClaudio Fontana     int16_t *m = vm;
282a3ef070eSClaudio Fontana     uintptr_t i;
283a3ef070eSClaudio Fontana 
284a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
285a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286a3ef070eSClaudio Fontana     }
287a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
288a3ef070eSClaudio Fontana }
289a3ef070eSClaudio Fontana 
HELPER(neon_sqdmulh_h)290a3ef070eSClaudio Fontana void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291a3ef070eSClaudio Fontana                             void *vq, uint32_t desc)
292a3ef070eSClaudio Fontana {
293a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
294a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm;
295a3ef070eSClaudio Fontana 
296a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
297a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298a3ef070eSClaudio Fontana     }
299a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
300a3ef070eSClaudio Fontana }
301a3ef070eSClaudio Fontana 
HELPER(neon_sqrdmulh_h)302a3ef070eSClaudio Fontana void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303a3ef070eSClaudio Fontana                              void *vq, uint32_t desc)
304a3ef070eSClaudio Fontana {
305a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
306a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm;
307a3ef070eSClaudio Fontana 
308a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
309a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310a3ef070eSClaudio Fontana     }
311a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
312a3ef070eSClaudio Fontana }
313a3ef070eSClaudio Fontana 
HELPER(neon_sqdmulh_idx_h)314f80701cbSRichard Henderson void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315f80701cbSRichard Henderson                                 void *vq, uint32_t desc)
316f80701cbSRichard Henderson {
317f80701cbSRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
318f80701cbSRichard Henderson     int idx = simd_data(desc);
319f80701cbSRichard Henderson     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320a5b72cccSRichard Henderson     intptr_t elements = opr_sz / 2;
321a5b72cccSRichard Henderson     intptr_t eltspersegment = MIN(16 / 2, elements);
322f80701cbSRichard Henderson 
323a5b72cccSRichard Henderson     for (i = 0; i < elements; i += 16 / 2) {
324f80701cbSRichard Henderson         int16_t mm = m[i];
325a5b72cccSRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
326f80701cbSRichard Henderson             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327f80701cbSRichard Henderson         }
328f80701cbSRichard Henderson     }
329f80701cbSRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
330f80701cbSRichard Henderson }
331f80701cbSRichard Henderson 
HELPER(neon_sqrdmulh_idx_h)332f80701cbSRichard Henderson void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333f80701cbSRichard Henderson                                  void *vq, uint32_t desc)
334f80701cbSRichard Henderson {
335f80701cbSRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
336f80701cbSRichard Henderson     int idx = simd_data(desc);
337f80701cbSRichard Henderson     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338a5b72cccSRichard Henderson     intptr_t elements = opr_sz / 2;
339a5b72cccSRichard Henderson     intptr_t eltspersegment = MIN(16 / 2, elements);
340f80701cbSRichard Henderson 
341a5b72cccSRichard Henderson     for (i = 0; i < elements; i += 16 / 2) {
342f80701cbSRichard Henderson         int16_t mm = m[i];
343a5b72cccSRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
344f80701cbSRichard Henderson             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345f80701cbSRichard Henderson         }
346f80701cbSRichard Henderson     }
347f80701cbSRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
348f80701cbSRichard Henderson }
349f80701cbSRichard Henderson 
HELPER(neon_sqrdmlah_idx_h)350f698e452SRichard Henderson void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351f698e452SRichard Henderson                                  void *vq, uint32_t desc)
352f698e452SRichard Henderson {
353f698e452SRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
354f698e452SRichard Henderson     int idx = simd_data(desc);
355f698e452SRichard Henderson     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356f698e452SRichard Henderson     intptr_t elements = opr_sz / 2;
357f698e452SRichard Henderson     intptr_t eltspersegment = MIN(16 / 2, elements);
358f698e452SRichard Henderson 
359f698e452SRichard Henderson     for (i = 0; i < elements; i += 16 / 2) {
360f698e452SRichard Henderson         int16_t mm = m[i];
361f698e452SRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
362f698e452SRichard Henderson             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363f698e452SRichard Henderson         }
364f698e452SRichard Henderson     }
365f698e452SRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
366f698e452SRichard Henderson }
367f698e452SRichard Henderson 
HELPER(neon_sqrdmlsh_idx_h)368f698e452SRichard Henderson void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369f698e452SRichard Henderson                                  void *vq, uint32_t desc)
370f698e452SRichard Henderson {
371f698e452SRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
372f698e452SRichard Henderson     int idx = simd_data(desc);
373f698e452SRichard Henderson     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374f698e452SRichard Henderson     intptr_t elements = opr_sz / 2;
375f698e452SRichard Henderson     intptr_t eltspersegment = MIN(16 / 2, elements);
376f698e452SRichard Henderson 
377f698e452SRichard Henderson     for (i = 0; i < elements; i += 16 / 2) {
378f698e452SRichard Henderson         int16_t mm = m[i];
379f698e452SRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
380f698e452SRichard Henderson             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381f698e452SRichard Henderson         }
382f698e452SRichard Henderson     }
383f698e452SRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
384f698e452SRichard Henderson }
385f698e452SRichard Henderson 
HELPER(sve2_sqrdmlah_h)386a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
388a3ef070eSClaudio Fontana {
389a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
390a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391a3ef070eSClaudio Fontana     uint32_t discard;
392a3ef070eSClaudio Fontana 
393a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
394a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395a3ef070eSClaudio Fontana     }
396a3ef070eSClaudio Fontana }
397a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmlsh_h)398a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
400a3ef070eSClaudio Fontana {
401a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
402a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403a3ef070eSClaudio Fontana     uint32_t discard;
404a3ef070eSClaudio Fontana 
405a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
406a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407a3ef070eSClaudio Fontana     }
408a3ef070eSClaudio Fontana }
409a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_h)410a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411a3ef070eSClaudio Fontana {
412a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
413a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm;
414a3ef070eSClaudio Fontana     uint32_t discard;
415a3ef070eSClaudio Fontana 
416a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
417a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418a3ef070eSClaudio Fontana     }
419a3ef070eSClaudio Fontana }
420a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_h)421a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422a3ef070eSClaudio Fontana {
423a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
424a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm;
425a3ef070eSClaudio Fontana     uint32_t discard;
426a3ef070eSClaudio Fontana 
427a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
428a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429a3ef070eSClaudio Fontana     }
430a3ef070eSClaudio Fontana }
431a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_idx_h)432a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433a3ef070eSClaudio Fontana {
434a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
435a3ef070eSClaudio Fontana     int idx = simd_data(desc);
436a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437a3ef070eSClaudio Fontana     uint32_t discard;
438a3ef070eSClaudio Fontana 
439a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440a3ef070eSClaudio Fontana         int16_t mm = m[i];
441a3ef070eSClaudio Fontana         for (j = 0; j < 16 / 2; ++j) {
442a3ef070eSClaudio Fontana             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443a3ef070eSClaudio Fontana         }
444a3ef070eSClaudio Fontana     }
445a3ef070eSClaudio Fontana }
446a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_idx_h)447a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448a3ef070eSClaudio Fontana {
449a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
450a3ef070eSClaudio Fontana     int idx = simd_data(desc);
451a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452a3ef070eSClaudio Fontana     uint32_t discard;
453a3ef070eSClaudio Fontana 
454a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455a3ef070eSClaudio Fontana         int16_t mm = m[i];
456a3ef070eSClaudio Fontana         for (j = 0; j < 16 / 2; ++j) {
457a3ef070eSClaudio Fontana             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458a3ef070eSClaudio Fontana         }
459a3ef070eSClaudio Fontana     }
460a3ef070eSClaudio Fontana }
461a3ef070eSClaudio Fontana 
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 32-bit.
 *
 * @src1, @src2: multiplicands
 * @src3:        accumulator
 * @neg:         negate the product before accumulating
 * @round:       add the rounding constant before the final shift
 * @sat:         set to 1 when the result had to be clamped; not written
 *               otherwise
 *
 * Returns the result clamped to [INT32_MIN, INT32_MAX].
 */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Same simplification as do_sqrdmlah_b, with a shift count of 31. */
    int64_t prod = (int64_t)src1 * src2;
    int64_t addend = ((int64_t)src3 << 31) + (round << 30);
    int64_t sum = ((neg ? -prod : prod) + addend) >> 31;

    if (sum < INT32_MIN) {
        *sat = 1;
        return INT32_MIN;
    }
    if (sum > INT32_MAX) {
        *sat = 1;
        return INT32_MAX;
    }
    return sum;
}
480a3ef070eSClaudio Fontana 
HELPER(neon_qrdmlah_s32)481a3ef070eSClaudio Fontana uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482a3ef070eSClaudio Fontana                                   int32_t src2, int32_t src3)
483a3ef070eSClaudio Fontana {
484a3ef070eSClaudio Fontana     uint32_t *sat = &env->vfp.qc[0];
485a3ef070eSClaudio Fontana     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486a3ef070eSClaudio Fontana }
487a3ef070eSClaudio Fontana 
HELPER(gvec_qrdmlah_s32)488a3ef070eSClaudio Fontana void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489a3ef070eSClaudio Fontana                               void *vq, uint32_t desc)
490a3ef070eSClaudio Fontana {
491a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
492a3ef070eSClaudio Fontana     int32_t *d = vd;
493a3ef070eSClaudio Fontana     int32_t *n = vn;
494a3ef070eSClaudio Fontana     int32_t *m = vm;
495a3ef070eSClaudio Fontana     uintptr_t i;
496a3ef070eSClaudio Fontana 
497a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
498a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499a3ef070eSClaudio Fontana     }
500a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
501a3ef070eSClaudio Fontana }
502a3ef070eSClaudio Fontana 
HELPER(neon_qrdmlsh_s32)503a3ef070eSClaudio Fontana uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504a3ef070eSClaudio Fontana                                   int32_t src2, int32_t src3)
505a3ef070eSClaudio Fontana {
506a3ef070eSClaudio Fontana     uint32_t *sat = &env->vfp.qc[0];
507a3ef070eSClaudio Fontana     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508a3ef070eSClaudio Fontana }
509a3ef070eSClaudio Fontana 
HELPER(gvec_qrdmlsh_s32)510a3ef070eSClaudio Fontana void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511a3ef070eSClaudio Fontana                               void *vq, uint32_t desc)
512a3ef070eSClaudio Fontana {
513a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
514a3ef070eSClaudio Fontana     int32_t *d = vd;
515a3ef070eSClaudio Fontana     int32_t *n = vn;
516a3ef070eSClaudio Fontana     int32_t *m = vm;
517a3ef070eSClaudio Fontana     uintptr_t i;
518a3ef070eSClaudio Fontana 
519a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
520a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521a3ef070eSClaudio Fontana     }
522a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
523a3ef070eSClaudio Fontana }
524a3ef070eSClaudio Fontana 
HELPER(neon_sqdmulh_s)525a3ef070eSClaudio Fontana void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526a3ef070eSClaudio Fontana                             void *vq, uint32_t desc)
527a3ef070eSClaudio Fontana {
528a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
529a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm;
530a3ef070eSClaudio Fontana 
531a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
532a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533a3ef070eSClaudio Fontana     }
534a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
535a3ef070eSClaudio Fontana }
536a3ef070eSClaudio Fontana 
HELPER(neon_sqrdmulh_s)537a3ef070eSClaudio Fontana void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538a3ef070eSClaudio Fontana                              void *vq, uint32_t desc)
539a3ef070eSClaudio Fontana {
540a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
541a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm;
542a3ef070eSClaudio Fontana 
543a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
544a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545a3ef070eSClaudio Fontana     }
546a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
547a3ef070eSClaudio Fontana }
548a3ef070eSClaudio Fontana 
HELPER(neon_sqdmulh_idx_s)549f80701cbSRichard Henderson void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550f80701cbSRichard Henderson                                 void *vq, uint32_t desc)
551f80701cbSRichard Henderson {
552f80701cbSRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
553f80701cbSRichard Henderson     int idx = simd_data(desc);
554f80701cbSRichard Henderson     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555a5b72cccSRichard Henderson     intptr_t elements = opr_sz / 4;
556a5b72cccSRichard Henderson     intptr_t eltspersegment = MIN(16 / 4, elements);
557f80701cbSRichard Henderson 
558a5b72cccSRichard Henderson     for (i = 0; i < elements; i += 16 / 4) {
559f80701cbSRichard Henderson         int32_t mm = m[i];
560a5b72cccSRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
561f80701cbSRichard Henderson             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562f80701cbSRichard Henderson         }
563f80701cbSRichard Henderson     }
564f80701cbSRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
565f80701cbSRichard Henderson }
566f80701cbSRichard Henderson 
HELPER(neon_sqrdmulh_idx_s)567f80701cbSRichard Henderson void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568f80701cbSRichard Henderson                                  void *vq, uint32_t desc)
569f80701cbSRichard Henderson {
570f80701cbSRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
571f80701cbSRichard Henderson     int idx = simd_data(desc);
572f80701cbSRichard Henderson     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573a5b72cccSRichard Henderson     intptr_t elements = opr_sz / 4;
574a5b72cccSRichard Henderson     intptr_t eltspersegment = MIN(16 / 4, elements);
575f80701cbSRichard Henderson 
576a5b72cccSRichard Henderson     for (i = 0; i < elements; i += 16 / 4) {
577f80701cbSRichard Henderson         int32_t mm = m[i];
578a5b72cccSRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
579f80701cbSRichard Henderson             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580f80701cbSRichard Henderson         }
581f80701cbSRichard Henderson     }
582f80701cbSRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
583f80701cbSRichard Henderson }
584f80701cbSRichard Henderson 
HELPER(neon_sqrdmlah_idx_s)585f698e452SRichard Henderson void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586f698e452SRichard Henderson                                  void *vq, uint32_t desc)
587f698e452SRichard Henderson {
588f698e452SRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
589f698e452SRichard Henderson     int idx = simd_data(desc);
590f698e452SRichard Henderson     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591f698e452SRichard Henderson     intptr_t elements = opr_sz / 4;
592f698e452SRichard Henderson     intptr_t eltspersegment = MIN(16 / 4, elements);
593f698e452SRichard Henderson 
594f698e452SRichard Henderson     for (i = 0; i < elements; i += 16 / 4) {
595f698e452SRichard Henderson         int32_t mm = m[i];
596f698e452SRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
597f698e452SRichard Henderson             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598f698e452SRichard Henderson         }
599f698e452SRichard Henderson     }
600f698e452SRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
601f698e452SRichard Henderson }
602f698e452SRichard Henderson 
HELPER(neon_sqrdmlsh_idx_s)603f698e452SRichard Henderson void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604f698e452SRichard Henderson                                  void *vq, uint32_t desc)
605f698e452SRichard Henderson {
606f698e452SRichard Henderson     intptr_t i, j, opr_sz = simd_oprsz(desc);
607f698e452SRichard Henderson     int idx = simd_data(desc);
608f698e452SRichard Henderson     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609f698e452SRichard Henderson     intptr_t elements = opr_sz / 4;
610f698e452SRichard Henderson     intptr_t eltspersegment = MIN(16 / 4, elements);
611f698e452SRichard Henderson 
612f698e452SRichard Henderson     for (i = 0; i < elements; i += 16 / 4) {
613f698e452SRichard Henderson         int32_t mm = m[i];
614f698e452SRichard Henderson         for (j = 0; j < eltspersegment; ++j) {
615f698e452SRichard Henderson             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616f698e452SRichard Henderson         }
617f698e452SRichard Henderson     }
618f698e452SRichard Henderson     clear_tail(d, opr_sz, simd_maxsz(desc));
619f698e452SRichard Henderson }
620f698e452SRichard Henderson 
HELPER(sve2_sqrdmlah_s)621a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
623a3ef070eSClaudio Fontana {
624a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
625a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626a3ef070eSClaudio Fontana     uint32_t discard;
627a3ef070eSClaudio Fontana 
628a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
629a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630a3ef070eSClaudio Fontana     }
631a3ef070eSClaudio Fontana }
632a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmlsh_s)633a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
635a3ef070eSClaudio Fontana {
636a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
637a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638a3ef070eSClaudio Fontana     uint32_t discard;
639a3ef070eSClaudio Fontana 
640a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
641a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642a3ef070eSClaudio Fontana     }
643a3ef070eSClaudio Fontana }
644a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_s)645a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646a3ef070eSClaudio Fontana {
647a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
648a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm;
649a3ef070eSClaudio Fontana     uint32_t discard;
650a3ef070eSClaudio Fontana 
651a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
652a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653a3ef070eSClaudio Fontana     }
654a3ef070eSClaudio Fontana }
655a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_s)656a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657a3ef070eSClaudio Fontana {
658a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
659a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm;
660a3ef070eSClaudio Fontana     uint32_t discard;
661a3ef070eSClaudio Fontana 
662a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
663a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664a3ef070eSClaudio Fontana     }
665a3ef070eSClaudio Fontana }
666a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_idx_s)667a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668a3ef070eSClaudio Fontana {
669a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
670a3ef070eSClaudio Fontana     int idx = simd_data(desc);
671a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672a3ef070eSClaudio Fontana     uint32_t discard;
673a3ef070eSClaudio Fontana 
674a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675a3ef070eSClaudio Fontana         int32_t mm = m[i];
676a3ef070eSClaudio Fontana         for (j = 0; j < 16 / 4; ++j) {
677a3ef070eSClaudio Fontana             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678a3ef070eSClaudio Fontana         }
679a3ef070eSClaudio Fontana     }
680a3ef070eSClaudio Fontana }
681a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_idx_s)682a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683a3ef070eSClaudio Fontana {
684a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
685a3ef070eSClaudio Fontana     int idx = simd_data(desc);
686a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687a3ef070eSClaudio Fontana     uint32_t discard;
688a3ef070eSClaudio Fontana 
689a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690a3ef070eSClaudio Fontana         int32_t mm = m[i];
691a3ef070eSClaudio Fontana         for (j = 0; j < 16 / 4; ++j) {
692a3ef070eSClaudio Fontana             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693a3ef070eSClaudio Fontana         }
694a3ef070eSClaudio Fontana     }
695a3ef070eSClaudio Fontana }
696a3ef070eSClaudio Fontana 
697a3ef070eSClaudio Fontana /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
do_sat128_d(Int128 r)698a3ef070eSClaudio Fontana static int64_t do_sat128_d(Int128 r)
699a3ef070eSClaudio Fontana {
700a3ef070eSClaudio Fontana     int64_t ls = int128_getlo(r);
701a3ef070eSClaudio Fontana     int64_t hs = int128_gethi(r);
702a3ef070eSClaudio Fontana 
703a3ef070eSClaudio Fontana     if (unlikely(hs != (ls >> 63))) {
704a3ef070eSClaudio Fontana         return hs < 0 ? INT64_MIN : INT64_MAX;
705a3ef070eSClaudio Fontana     }
706a3ef070eSClaudio Fontana     return ls;
707a3ef070eSClaudio Fontana }
708a3ef070eSClaudio Fontana 
do_sqrdmlah_d(int64_t n,int64_t m,int64_t a,bool neg,bool round)709a3ef070eSClaudio Fontana int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710a3ef070eSClaudio Fontana {
711a3ef070eSClaudio Fontana     uint64_t l, h;
712a3ef070eSClaudio Fontana     Int128 r, t;
713a3ef070eSClaudio Fontana 
714a3ef070eSClaudio Fontana     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715a3ef070eSClaudio Fontana     muls64(&l, &h, m, n);
716a3ef070eSClaudio Fontana     r = int128_make128(l, h);
717a3ef070eSClaudio Fontana     if (neg) {
718a3ef070eSClaudio Fontana         r = int128_neg(r);
719a3ef070eSClaudio Fontana     }
720a3ef070eSClaudio Fontana     if (a) {
721a3ef070eSClaudio Fontana         t = int128_exts64(a);
722a3ef070eSClaudio Fontana         t = int128_lshift(t, 63);
723a3ef070eSClaudio Fontana         r = int128_add(r, t);
724a3ef070eSClaudio Fontana     }
725a3ef070eSClaudio Fontana     if (round) {
726a3ef070eSClaudio Fontana         t = int128_exts64(1ll << 62);
727a3ef070eSClaudio Fontana         r = int128_add(r, t);
728a3ef070eSClaudio Fontana     }
729a3ef070eSClaudio Fontana     r = int128_rshift(r, 63);
730a3ef070eSClaudio Fontana 
731a3ef070eSClaudio Fontana     return do_sat128_d(r);
732a3ef070eSClaudio Fontana }
733a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmlah_d)734a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
736a3ef070eSClaudio Fontana {
737a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
738a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739a3ef070eSClaudio Fontana 
740a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
741a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742a3ef070eSClaudio Fontana     }
743a3ef070eSClaudio Fontana }
744a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmlsh_d)745a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746a3ef070eSClaudio Fontana                              void *va, uint32_t desc)
747a3ef070eSClaudio Fontana {
748a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
749a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750a3ef070eSClaudio Fontana 
751a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
752a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753a3ef070eSClaudio Fontana     }
754a3ef070eSClaudio Fontana }
755a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_d)756a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757a3ef070eSClaudio Fontana {
758a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
759a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = vm;
760a3ef070eSClaudio Fontana 
761a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
762a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763a3ef070eSClaudio Fontana     }
764a3ef070eSClaudio Fontana }
765a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_d)766a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767a3ef070eSClaudio Fontana {
768a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
769a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = vm;
770a3ef070eSClaudio Fontana 
771a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
772a3ef070eSClaudio Fontana         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773a3ef070eSClaudio Fontana     }
774a3ef070eSClaudio Fontana }
775a3ef070eSClaudio Fontana 
HELPER(sve2_sqdmulh_idx_d)776a3ef070eSClaudio Fontana void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777a3ef070eSClaudio Fontana {
778a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
779a3ef070eSClaudio Fontana     int idx = simd_data(desc);
780a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781a3ef070eSClaudio Fontana 
782a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783a3ef070eSClaudio Fontana         int64_t mm = m[i];
784a3ef070eSClaudio Fontana         for (j = 0; j < 16 / 8; ++j) {
785a3ef070eSClaudio Fontana             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786a3ef070eSClaudio Fontana         }
787a3ef070eSClaudio Fontana     }
788a3ef070eSClaudio Fontana }
789a3ef070eSClaudio Fontana 
HELPER(sve2_sqrdmulh_idx_d)790a3ef070eSClaudio Fontana void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791a3ef070eSClaudio Fontana {
792a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
793a3ef070eSClaudio Fontana     int idx = simd_data(desc);
794a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795a3ef070eSClaudio Fontana 
796a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797a3ef070eSClaudio Fontana         int64_t mm = m[i];
798a3ef070eSClaudio Fontana         for (j = 0; j < 16 / 8; ++j) {
799a3ef070eSClaudio Fontana             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800a3ef070eSClaudio Fontana         }
801a3ef070eSClaudio Fontana     }
802a3ef070eSClaudio Fontana }
803a3ef070eSClaudio Fontana 
804a3ef070eSClaudio Fontana /* Integer 8 and 16-bit dot-product.
805a3ef070eSClaudio Fontana  *
806a3ef070eSClaudio Fontana  * Note that for the loops herein, host endianness does not matter
807a3ef070eSClaudio Fontana  * with respect to the ordering of data within the quad-width lanes.
808a3ef070eSClaudio Fontana  * All elements are treated equally, no matter where they are.
809a3ef070eSClaudio Fontana  */
810a3ef070eSClaudio Fontana 
811a3ef070eSClaudio Fontana #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813a3ef070eSClaudio Fontana {                                                                         \
814a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815a3ef070eSClaudio Fontana     TYPED *d = vd, *a = va;                                               \
816a3ef070eSClaudio Fontana     TYPEN *n = vn;                                                        \
817a3ef070eSClaudio Fontana     TYPEM *m = vm;                                                        \
818a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819a3ef070eSClaudio Fontana         d[i] = (a[i] +                                                    \
820a3ef070eSClaudio Fontana                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821a3ef070eSClaudio Fontana                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822a3ef070eSClaudio Fontana                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823a3ef070eSClaudio Fontana                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824a3ef070eSClaudio Fontana     }                                                                     \
825a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826a3ef070eSClaudio Fontana }
827a3ef070eSClaudio Fontana 
DO_DOT(gvec_sdot_b,int32_t,int8_t,int8_t)828a3ef070eSClaudio Fontana DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829a3ef070eSClaudio Fontana DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830a3ef070eSClaudio Fontana DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831a3ef070eSClaudio Fontana DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832a3ef070eSClaudio Fontana DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833a3ef070eSClaudio Fontana 
834a3ef070eSClaudio Fontana #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836a3ef070eSClaudio Fontana {                                                                         \
837a3ef070eSClaudio Fontana     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838a3ef070eSClaudio Fontana     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839*5e29203bSPeter Maydell     /*                                                                    \
840*5e29203bSPeter Maydell      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841*5e29203bSPeter Maydell      * first iteration might not be a full 16 byte segment. But           \
842*5e29203bSPeter Maydell      * for vector lengths beyond that this must be SVE and we know        \
843*5e29203bSPeter Maydell      * opr_sz is a multiple of 16, so we need not clamp segend            \
844*5e29203bSPeter Maydell      * to opr_sz_n when we advance it at the end of the loop.             \
845*5e29203bSPeter Maydell      */                                                                   \
846a3ef070eSClaudio Fontana     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847a3ef070eSClaudio Fontana     intptr_t index = simd_data(desc);                                     \
848a3ef070eSClaudio Fontana     TYPED *d = vd, *a = va;                                               \
849a3ef070eSClaudio Fontana     TYPEN *n = vn;                                                        \
850a3ef070eSClaudio Fontana     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851a3ef070eSClaudio Fontana     do {                                                                  \
852a3ef070eSClaudio Fontana         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853a3ef070eSClaudio Fontana         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854a3ef070eSClaudio Fontana         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855a3ef070eSClaudio Fontana         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856a3ef070eSClaudio Fontana         do {                                                              \
857a3ef070eSClaudio Fontana             d[i] = (a[i] +                                                \
858a3ef070eSClaudio Fontana                     n[i * 4 + 0] * m0 +                                   \
859a3ef070eSClaudio Fontana                     n[i * 4 + 1] * m1 +                                   \
860a3ef070eSClaudio Fontana                     n[i * 4 + 2] * m2 +                                   \
861a3ef070eSClaudio Fontana                     n[i * 4 + 3] * m3);                                   \
862a3ef070eSClaudio Fontana         } while (++i < segend);                                           \
863*5e29203bSPeter Maydell         segend = i + (16 / sizeof(TYPED));                                \
864a3ef070eSClaudio Fontana     } while (i < opr_sz_n);                                               \
865a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866a3ef070eSClaudio Fontana }
867a3ef070eSClaudio Fontana 
868a3ef070eSClaudio Fontana DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869a3ef070eSClaudio Fontana DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870a3ef070eSClaudio Fontana DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871a3ef070eSClaudio Fontana DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872a3ef070eSClaudio Fontana DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873a3ef070eSClaudio Fontana DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
874a3ef070eSClaudio Fontana 
875a3ef070eSClaudio Fontana void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876a3ef070eSClaudio Fontana                          void *vfpst, uint32_t desc)
877a3ef070eSClaudio Fontana {
878a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
879a3ef070eSClaudio Fontana     float16 *d = vd;
880a3ef070eSClaudio Fontana     float16 *n = vn;
881a3ef070eSClaudio Fontana     float16 *m = vm;
882a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
883a3ef070eSClaudio Fontana     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
884a3ef070eSClaudio Fontana     uint32_t neg_imag = neg_real ^ 1;
885a3ef070eSClaudio Fontana     uintptr_t i;
886a3ef070eSClaudio Fontana 
887a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
888a3ef070eSClaudio Fontana     neg_real <<= 15;
889a3ef070eSClaudio Fontana     neg_imag <<= 15;
890a3ef070eSClaudio Fontana 
891a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; i += 2) {
892a3ef070eSClaudio Fontana         float16 e0 = n[H2(i)];
893a3ef070eSClaudio Fontana         float16 e1 = m[H2(i + 1)] ^ neg_imag;
894a3ef070eSClaudio Fontana         float16 e2 = n[H2(i + 1)];
895a3ef070eSClaudio Fontana         float16 e3 = m[H2(i)] ^ neg_real;
896a3ef070eSClaudio Fontana 
897a3ef070eSClaudio Fontana         d[H2(i)] = float16_add(e0, e1, fpst);
898a3ef070eSClaudio Fontana         d[H2(i + 1)] = float16_add(e2, e3, fpst);
899a3ef070eSClaudio Fontana     }
900a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
901a3ef070eSClaudio Fontana }
902a3ef070eSClaudio Fontana 
HELPER(gvec_fcadds)903a3ef070eSClaudio Fontana void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
904a3ef070eSClaudio Fontana                          void *vfpst, uint32_t desc)
905a3ef070eSClaudio Fontana {
906a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
907a3ef070eSClaudio Fontana     float32 *d = vd;
908a3ef070eSClaudio Fontana     float32 *n = vn;
909a3ef070eSClaudio Fontana     float32 *m = vm;
910a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
911a3ef070eSClaudio Fontana     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
912a3ef070eSClaudio Fontana     uint32_t neg_imag = neg_real ^ 1;
913a3ef070eSClaudio Fontana     uintptr_t i;
914a3ef070eSClaudio Fontana 
915a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
916a3ef070eSClaudio Fontana     neg_real <<= 31;
917a3ef070eSClaudio Fontana     neg_imag <<= 31;
918a3ef070eSClaudio Fontana 
919a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; i += 2) {
920a3ef070eSClaudio Fontana         float32 e0 = n[H4(i)];
921a3ef070eSClaudio Fontana         float32 e1 = m[H4(i + 1)] ^ neg_imag;
922a3ef070eSClaudio Fontana         float32 e2 = n[H4(i + 1)];
923a3ef070eSClaudio Fontana         float32 e3 = m[H4(i)] ^ neg_real;
924a3ef070eSClaudio Fontana 
925a3ef070eSClaudio Fontana         d[H4(i)] = float32_add(e0, e1, fpst);
926a3ef070eSClaudio Fontana         d[H4(i + 1)] = float32_add(e2, e3, fpst);
927a3ef070eSClaudio Fontana     }
928a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
929a3ef070eSClaudio Fontana }
930a3ef070eSClaudio Fontana 
HELPER(gvec_fcaddd)931a3ef070eSClaudio Fontana void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
932a3ef070eSClaudio Fontana                          void *vfpst, uint32_t desc)
933a3ef070eSClaudio Fontana {
934a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
935a3ef070eSClaudio Fontana     float64 *d = vd;
936a3ef070eSClaudio Fontana     float64 *n = vn;
937a3ef070eSClaudio Fontana     float64 *m = vm;
938a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
939a3ef070eSClaudio Fontana     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
940a3ef070eSClaudio Fontana     uint64_t neg_imag = neg_real ^ 1;
941a3ef070eSClaudio Fontana     uintptr_t i;
942a3ef070eSClaudio Fontana 
943a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
944a3ef070eSClaudio Fontana     neg_real <<= 63;
945a3ef070eSClaudio Fontana     neg_imag <<= 63;
946a3ef070eSClaudio Fontana 
947a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; i += 2) {
948a3ef070eSClaudio Fontana         float64 e0 = n[i];
949a3ef070eSClaudio Fontana         float64 e1 = m[i + 1] ^ neg_imag;
950a3ef070eSClaudio Fontana         float64 e2 = n[i + 1];
951a3ef070eSClaudio Fontana         float64 e3 = m[i] ^ neg_real;
952a3ef070eSClaudio Fontana 
953a3ef070eSClaudio Fontana         d[i] = float64_add(e0, e1, fpst);
954a3ef070eSClaudio Fontana         d[i + 1] = float64_add(e2, e3, fpst);
955a3ef070eSClaudio Fontana     }
956a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
957a3ef070eSClaudio Fontana }
958a3ef070eSClaudio Fontana 
HELPER(gvec_fcmlah)959a3ef070eSClaudio Fontana void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
960a3ef070eSClaudio Fontana                          void *vfpst, uint32_t desc)
961a3ef070eSClaudio Fontana {
962a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
963a3ef070eSClaudio Fontana     float16 *d = vd, *n = vn, *m = vm, *a = va;
964a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
965a3ef070eSClaudio Fontana     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
966a3ef070eSClaudio Fontana     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
967a3ef070eSClaudio Fontana     uint32_t neg_real = flip ^ neg_imag;
968a3ef070eSClaudio Fontana     uintptr_t i;
969a3ef070eSClaudio Fontana 
970a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
971a3ef070eSClaudio Fontana     neg_real <<= 15;
972a3ef070eSClaudio Fontana     neg_imag <<= 15;
973a3ef070eSClaudio Fontana 
974a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; i += 2) {
975a3ef070eSClaudio Fontana         float16 e2 = n[H2(i + flip)];
976a3ef070eSClaudio Fontana         float16 e1 = m[H2(i + flip)] ^ neg_real;
977a3ef070eSClaudio Fontana         float16 e4 = e2;
978a3ef070eSClaudio Fontana         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
979a3ef070eSClaudio Fontana 
980a3ef070eSClaudio Fontana         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
981a3ef070eSClaudio Fontana         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
982a3ef070eSClaudio Fontana     }
983a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
984a3ef070eSClaudio Fontana }
985a3ef070eSClaudio Fontana 
HELPER(gvec_fcmlah_idx)986a3ef070eSClaudio Fontana void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
987a3ef070eSClaudio Fontana                              void *vfpst, uint32_t desc)
988a3ef070eSClaudio Fontana {
989a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
990a3ef070eSClaudio Fontana     float16 *d = vd, *n = vn, *m = vm, *a = va;
991a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
992a3ef070eSClaudio Fontana     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
993a3ef070eSClaudio Fontana     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
994a3ef070eSClaudio Fontana     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
995a3ef070eSClaudio Fontana     uint32_t neg_real = flip ^ neg_imag;
996a3ef070eSClaudio Fontana     intptr_t elements = opr_sz / sizeof(float16);
99776bccf3cSRichard Henderson     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
998a3ef070eSClaudio Fontana     intptr_t i, j;
999a3ef070eSClaudio Fontana 
1000a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
1001a3ef070eSClaudio Fontana     neg_real <<= 15;
1002a3ef070eSClaudio Fontana     neg_imag <<= 15;
1003a3ef070eSClaudio Fontana 
1004a3ef070eSClaudio Fontana     for (i = 0; i < elements; i += eltspersegment) {
1005a3ef070eSClaudio Fontana         float16 mr = m[H2(i + 2 * index + 0)];
1006a3ef070eSClaudio Fontana         float16 mi = m[H2(i + 2 * index + 1)];
1007a3ef070eSClaudio Fontana         float16 e1 = neg_real ^ (flip ? mi : mr);
1008a3ef070eSClaudio Fontana         float16 e3 = neg_imag ^ (flip ? mr : mi);
1009a3ef070eSClaudio Fontana 
1010a3ef070eSClaudio Fontana         for (j = i; j < i + eltspersegment; j += 2) {
1011a3ef070eSClaudio Fontana             float16 e2 = n[H2(j + flip)];
1012a3ef070eSClaudio Fontana             float16 e4 = e2;
1013a3ef070eSClaudio Fontana 
1014a3ef070eSClaudio Fontana             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1015a3ef070eSClaudio Fontana             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1016a3ef070eSClaudio Fontana         }
1017a3ef070eSClaudio Fontana     }
1018a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
1019a3ef070eSClaudio Fontana }
1020a3ef070eSClaudio Fontana 
HELPER(gvec_fcmlas)1021a3ef070eSClaudio Fontana void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1022a3ef070eSClaudio Fontana                          void *vfpst, uint32_t desc)
1023a3ef070eSClaudio Fontana {
1024a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
1025a3ef070eSClaudio Fontana     float32 *d = vd, *n = vn, *m = vm, *a = va;
1026a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
1027a3ef070eSClaudio Fontana     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1028a3ef070eSClaudio Fontana     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1029a3ef070eSClaudio Fontana     uint32_t neg_real = flip ^ neg_imag;
1030a3ef070eSClaudio Fontana     uintptr_t i;
1031a3ef070eSClaudio Fontana 
1032a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
1033a3ef070eSClaudio Fontana     neg_real <<= 31;
1034a3ef070eSClaudio Fontana     neg_imag <<= 31;
1035a3ef070eSClaudio Fontana 
1036a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; i += 2) {
1037a3ef070eSClaudio Fontana         float32 e2 = n[H4(i + flip)];
1038a3ef070eSClaudio Fontana         float32 e1 = m[H4(i + flip)] ^ neg_real;
1039a3ef070eSClaudio Fontana         float32 e4 = e2;
1040a3ef070eSClaudio Fontana         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1041a3ef070eSClaudio Fontana 
1042a3ef070eSClaudio Fontana         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1043a3ef070eSClaudio Fontana         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1044a3ef070eSClaudio Fontana     }
1045a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
1046a3ef070eSClaudio Fontana }
1047a3ef070eSClaudio Fontana 
HELPER(gvec_fcmlas_idx)1048a3ef070eSClaudio Fontana void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1049a3ef070eSClaudio Fontana                              void *vfpst, uint32_t desc)
1050a3ef070eSClaudio Fontana {
1051a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
1052a3ef070eSClaudio Fontana     float32 *d = vd, *n = vn, *m = vm, *a = va;
1053a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
1054a3ef070eSClaudio Fontana     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1055a3ef070eSClaudio Fontana     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1056a3ef070eSClaudio Fontana     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1057a3ef070eSClaudio Fontana     uint32_t neg_real = flip ^ neg_imag;
1058a3ef070eSClaudio Fontana     intptr_t elements = opr_sz / sizeof(float32);
105976bccf3cSRichard Henderson     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1060a3ef070eSClaudio Fontana     intptr_t i, j;
1061a3ef070eSClaudio Fontana 
1062a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
1063a3ef070eSClaudio Fontana     neg_real <<= 31;
1064a3ef070eSClaudio Fontana     neg_imag <<= 31;
1065a3ef070eSClaudio Fontana 
1066a3ef070eSClaudio Fontana     for (i = 0; i < elements; i += eltspersegment) {
1067a3ef070eSClaudio Fontana         float32 mr = m[H4(i + 2 * index + 0)];
1068a3ef070eSClaudio Fontana         float32 mi = m[H4(i + 2 * index + 1)];
1069a3ef070eSClaudio Fontana         float32 e1 = neg_real ^ (flip ? mi : mr);
1070a3ef070eSClaudio Fontana         float32 e3 = neg_imag ^ (flip ? mr : mi);
1071a3ef070eSClaudio Fontana 
1072a3ef070eSClaudio Fontana         for (j = i; j < i + eltspersegment; j += 2) {
1073a3ef070eSClaudio Fontana             float32 e2 = n[H4(j + flip)];
1074a3ef070eSClaudio Fontana             float32 e4 = e2;
1075a3ef070eSClaudio Fontana 
1076a3ef070eSClaudio Fontana             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1077a3ef070eSClaudio Fontana             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1078a3ef070eSClaudio Fontana         }
1079a3ef070eSClaudio Fontana     }
1080a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
1081a3ef070eSClaudio Fontana }
1082a3ef070eSClaudio Fontana 
HELPER(gvec_fcmlad)1083a3ef070eSClaudio Fontana void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1084a3ef070eSClaudio Fontana                          void *vfpst, uint32_t desc)
1085a3ef070eSClaudio Fontana {
1086a3ef070eSClaudio Fontana     uintptr_t opr_sz = simd_oprsz(desc);
1087a3ef070eSClaudio Fontana     float64 *d = vd, *n = vn, *m = vm, *a = va;
1088a3ef070eSClaudio Fontana     float_status *fpst = vfpst;
1089a3ef070eSClaudio Fontana     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1090a3ef070eSClaudio Fontana     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1091a3ef070eSClaudio Fontana     uint64_t neg_real = flip ^ neg_imag;
1092a3ef070eSClaudio Fontana     uintptr_t i;
1093a3ef070eSClaudio Fontana 
1094a3ef070eSClaudio Fontana     /* Shift boolean to the sign bit so we can xor to negate.  */
1095a3ef070eSClaudio Fontana     neg_real <<= 63;
1096a3ef070eSClaudio Fontana     neg_imag <<= 63;
1097a3ef070eSClaudio Fontana 
1098a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; i += 2) {
1099a3ef070eSClaudio Fontana         float64 e2 = n[i + flip];
1100a3ef070eSClaudio Fontana         float64 e1 = m[i + flip] ^ neg_real;
1101a3ef070eSClaudio Fontana         float64 e4 = e2;
1102a3ef070eSClaudio Fontana         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1103a3ef070eSClaudio Fontana 
1104a3ef070eSClaudio Fontana         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1105a3ef070eSClaudio Fontana         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1106a3ef070eSClaudio Fontana     }
1107a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
1108a3ef070eSClaudio Fontana }
1109a3ef070eSClaudio Fontana 
1110a3ef070eSClaudio Fontana /*
1111a3ef070eSClaudio Fontana  * Floating point comparisons producing an integer result (all 1s or all 0s).
1112a3ef070eSClaudio Fontana  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1113a3ef070eSClaudio Fontana  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1114a3ef070eSClaudio Fontana  */
/* Return 0xffff when float16_eq_quiet(op1, op2) holds, otherwise 0. */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    bool equal = float16_eq_quiet(op1, op2, stat);

    return equal ? UINT16_MAX : 0;
}
1119a3ef070eSClaudio Fontana 
/* Return 0xffffffff when float32_eq_quiet(op1, op2) holds, otherwise 0. */
static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    bool equal = float32_eq_quiet(op1, op2, stat);

    return equal ? UINT32_MAX : 0;
}
1124a3ef070eSClaudio Fontana 
/* Return all ones (64 bits) when float64_eq_quiet(op1, op2) holds, else 0. */
static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    bool equal = float64_eq_quiet(op1, op2, stat);

    return equal ? UINT64_MAX : 0;
}
11294fe068faSRichard Henderson 
/*
 * op1 >= op2, evaluated as float16_le(op2, op1).
 * Returns 0xffff when true, otherwise 0.
 */
static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    bool ge = float16_le(op2, op1, stat);

    return ge ? UINT16_MAX : 0;
}
1134a3ef070eSClaudio Fontana 
/*
 * op1 >= op2, evaluated as float32_le(op2, op1).
 * Returns 0xffffffff when true, otherwise 0.
 */
static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    bool ge = float32_le(op2, op1, stat);

    return ge ? UINT32_MAX : 0;
}
1139a3ef070eSClaudio Fontana 
/*
 * op1 >= op2, evaluated as float64_le(op2, op1).
 * Returns all ones (64 bits) when true, otherwise 0.
 */
static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    bool ge = float64_le(op2, op1, stat);

    return ge ? UINT64_MAX : 0;
}
11444fe068faSRichard Henderson 
/*
 * op1 > op2, evaluated as float16_lt(op2, op1).
 * Returns 0xffff when true, otherwise 0.
 */
static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    bool gt = float16_lt(op2, op1, stat);

    return gt ? UINT16_MAX : 0;
}
1149a3ef070eSClaudio Fontana 
/*
 * op1 > op2, evaluated as float32_lt(op2, op1).
 * Returns 0xffffffff when true, otherwise 0.
 */
static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    bool gt = float32_lt(op2, op1, stat);

    return gt ? UINT32_MAX : 0;
}
1154a3ef070eSClaudio Fontana 
/*
 * op1 > op2, evaluated as float64_lt(op2, op1).
 * Returns all ones (64 bits) when true, otherwise 0.
 */
static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    bool gt = float64_lt(op2, op1, stat);

    return gt ? UINT64_MAX : 0;
}
11594fe068faSRichard Henderson 
/*
 * |op1| >= |op2|, evaluated as float16_le(|op2|, |op1|).
 * Returns 0xffff when true, otherwise 0.
 */
static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    float16 mag1 = float16_abs(op1);
    float16 mag2 = float16_abs(op2);
    bool ge = float16_le(mag2, mag1, stat);

    return ge ? UINT16_MAX : 0;
}
1164a3ef070eSClaudio Fontana 
/*
 * |op1| >= |op2|, evaluated as float32_le(|op2|, |op1|).
 * Returns 0xffffffff when true, otherwise 0.
 */
static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    float32 mag1 = float32_abs(op1);
    float32 mag2 = float32_abs(op2);
    bool ge = float32_le(mag2, mag1, stat);

    return ge ? UINT32_MAX : 0;
}
1169a3ef070eSClaudio Fontana 
/*
 * |op1| >= |op2|, evaluated as float64_le(|op2|, |op1|).
 * Returns all ones (64 bits) when true, otherwise 0.
 */
static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    float64 mag1 = float64_abs(op1);
    float64 mag2 = float64_abs(op2);
    bool ge = float64_le(mag2, mag1, stat);

    return ge ? UINT64_MAX : 0;
}
11744fe068faSRichard Henderson 
/*
 * |op1| > |op2|, evaluated as float16_lt(|op2|, |op1|).
 * Returns 0xffff when true, otherwise 0.
 */
static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    float16 mag1 = float16_abs(op1);
    float16 mag2 = float16_abs(op2);
    bool gt = float16_lt(mag2, mag1, stat);

    return gt ? UINT16_MAX : 0;
}
1179a3ef070eSClaudio Fontana 
/*
 * |op1| > |op2|, evaluated as float32_lt(|op2|, |op1|).
 * Returns 0xffffffff when true, otherwise 0.
 */
static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    float32 mag1 = float32_abs(op1);
    float32 mag2 = float32_abs(op2);
    bool gt = float32_lt(mag2, mag1, stat);

    return gt ? UINT32_MAX : 0;
}
1184a3ef070eSClaudio Fontana 
/*
 * |op1| > |op2|, evaluated as float64_lt(|op2|, |op1|).
 * Returns all ones (64 bits) when true, otherwise 0.
 */
static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    float64 mag1 = float64_abs(op1);
    float64 mag2 = float64_abs(op2);
    bool gt = float64_lt(mag2, mag1, stat);

    return gt ? UINT64_MAX : 0;
}
11894fe068faSRichard Henderson 
vfp_tosszh(float16 x,void * fpstp)1190a3ef070eSClaudio Fontana static int16_t vfp_tosszh(float16 x, void *fpstp)
1191a3ef070eSClaudio Fontana {
1192a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1193a3ef070eSClaudio Fontana     if (float16_is_any_nan(x)) {
1194a3ef070eSClaudio Fontana         float_raise(float_flag_invalid, fpst);
1195a3ef070eSClaudio Fontana         return 0;
1196a3ef070eSClaudio Fontana     }
1197a3ef070eSClaudio Fontana     return float16_to_int16_round_to_zero(x, fpst);
1198a3ef070eSClaudio Fontana }
1199a3ef070eSClaudio Fontana 
vfp_touszh(float16 x,void * fpstp)1200a3ef070eSClaudio Fontana static uint16_t vfp_touszh(float16 x, void *fpstp)
1201a3ef070eSClaudio Fontana {
1202a3ef070eSClaudio Fontana     float_status *fpst = fpstp;
1203a3ef070eSClaudio Fontana     if (float16_is_any_nan(x)) {
1204a3ef070eSClaudio Fontana         float_raise(float_flag_invalid, fpst);
1205a3ef070eSClaudio Fontana         return 0;
1206a3ef070eSClaudio Fontana     }
1207a3ef070eSClaudio Fontana     return float16_to_uint16_round_to_zero(x, fpst);
1208a3ef070eSClaudio Fontana }
1209a3ef070eSClaudio Fontana 
1210a3ef070eSClaudio Fontana #define DO_2OP(NAME, FUNC, TYPE) \
1211a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1212a3ef070eSClaudio Fontana {                                                                 \
1213a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);                         \
1214a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn;                                        \
1215a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1216a3ef070eSClaudio Fontana         d[i] = FUNC(n[i], stat);                                  \
1217a3ef070eSClaudio Fontana     }                                                             \
1218a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1219a3ef070eSClaudio Fontana }
1220a3ef070eSClaudio Fontana 
DO_2OP(gvec_frecpe_h,helper_recpe_f16,float16)1221a3ef070eSClaudio Fontana DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1222a3ef070eSClaudio Fontana DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1223a3ef070eSClaudio Fontana DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1224a3ef070eSClaudio Fontana 
1225a3ef070eSClaudio Fontana DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1226a3ef070eSClaudio Fontana DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1227a3ef070eSClaudio Fontana DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1228a3ef070eSClaudio Fontana 
1229a3ef070eSClaudio Fontana DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1230a3ef070eSClaudio Fontana DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1231a3ef070eSClaudio Fontana 
1232a3ef070eSClaudio Fontana DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1233a3ef070eSClaudio Fontana DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1234a3ef070eSClaudio Fontana DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1235a3ef070eSClaudio Fontana DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1236a3ef070eSClaudio Fontana DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1237a3ef070eSClaudio Fontana DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1238a3ef070eSClaudio Fontana DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1239a3ef070eSClaudio Fontana DO_2OP(gvec_touszh, vfp_touszh, float16)
1240a3ef070eSClaudio Fontana 
1241a3ef070eSClaudio Fontana #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1242a3ef070eSClaudio Fontana     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1243a3ef070eSClaudio Fontana     {                                                           \
1244a3ef070eSClaudio Fontana         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1245a3ef070eSClaudio Fontana     }
1246a3ef070eSClaudio Fontana 
1247a3ef070eSClaudio Fontana #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1248a3ef070eSClaudio Fontana     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1249a3ef070eSClaudio Fontana     {                                                           \
1250a3ef070eSClaudio Fontana         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1251a3ef070eSClaudio Fontana     }
1252a3ef070eSClaudio Fontana 
1253a3ef070eSClaudio Fontana #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1254a3ef070eSClaudio Fontana     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1255a3ef070eSClaudio Fontana     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1256a3ef070eSClaudio Fontana     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1257a3ef070eSClaudio Fontana     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1258a3ef070eSClaudio Fontana 
1259a3ef070eSClaudio Fontana DO_2OP_CMP0(cgt, cgt, FWD)
1260a3ef070eSClaudio Fontana DO_2OP_CMP0(cge, cge, FWD)
1261a3ef070eSClaudio Fontana DO_2OP_CMP0(ceq, ceq, FWD)
1262a3ef070eSClaudio Fontana DO_2OP_CMP0(clt, cgt, REV)
1263a3ef070eSClaudio Fontana DO_2OP_CMP0(cle, cge, REV)
1264a3ef070eSClaudio Fontana 
1265a3ef070eSClaudio Fontana #undef DO_2OP
1266a3ef070eSClaudio Fontana #undef DO_2OP_CMP0
1267a3ef070eSClaudio Fontana 
1268a3ef070eSClaudio Fontana /* Floating-point trigonometric starting value.
1269a3ef070eSClaudio Fontana  * See the ARM ARM pseudocode function FPTrigSMul.
1270a3ef070eSClaudio Fontana  */
1271a3ef070eSClaudio Fontana static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1272a3ef070eSClaudio Fontana {
1273a3ef070eSClaudio Fontana     float16 result = float16_mul(op1, op1, stat);
1274a3ef070eSClaudio Fontana     if (!float16_is_any_nan(result)) {
1275a3ef070eSClaudio Fontana         result = float16_set_sign(result, op2 & 1);
1276a3ef070eSClaudio Fontana     }
1277a3ef070eSClaudio Fontana     return result;
1278a3ef070eSClaudio Fontana }
1279a3ef070eSClaudio Fontana 
float32_ftsmul(float32 op1,uint32_t op2,float_status * stat)1280a3ef070eSClaudio Fontana static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1281a3ef070eSClaudio Fontana {
1282a3ef070eSClaudio Fontana     float32 result = float32_mul(op1, op1, stat);
1283a3ef070eSClaudio Fontana     if (!float32_is_any_nan(result)) {
1284a3ef070eSClaudio Fontana         result = float32_set_sign(result, op2 & 1);
1285a3ef070eSClaudio Fontana     }
1286a3ef070eSClaudio Fontana     return result;
1287a3ef070eSClaudio Fontana }
1288a3ef070eSClaudio Fontana 
float64_ftsmul(float64 op1,uint64_t op2,float_status * stat)1289a3ef070eSClaudio Fontana static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1290a3ef070eSClaudio Fontana {
1291a3ef070eSClaudio Fontana     float64 result = float64_mul(op1, op1, stat);
1292a3ef070eSClaudio Fontana     if (!float64_is_any_nan(result)) {
1293a3ef070eSClaudio Fontana         result = float64_set_sign(result, op2 & 1);
1294a3ef070eSClaudio Fontana     }
1295a3ef070eSClaudio Fontana     return result;
1296a3ef070eSClaudio Fontana }
1297a3ef070eSClaudio Fontana 
float16_abd(float16 op1,float16 op2,float_status * stat)1298a3ef070eSClaudio Fontana static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1299a3ef070eSClaudio Fontana {
1300a3ef070eSClaudio Fontana     return float16_abs(float16_sub(op1, op2, stat));
1301a3ef070eSClaudio Fontana }
1302a3ef070eSClaudio Fontana 
float32_abd(float32 op1,float32 op2,float_status * stat)1303a3ef070eSClaudio Fontana static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1304a3ef070eSClaudio Fontana {
1305a3ef070eSClaudio Fontana     return float32_abs(float32_sub(op1, op2, stat));
1306a3ef070eSClaudio Fontana }
1307a3ef070eSClaudio Fontana 
float64_abd(float64 op1,float64 op2,float_status * stat)130843454734SRichard Henderson static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
130943454734SRichard Henderson {
131043454734SRichard Henderson     return float64_abs(float64_sub(op1, op2, stat));
131143454734SRichard Henderson }
131243454734SRichard Henderson 
1313a3ef070eSClaudio Fontana /*
1314a3ef070eSClaudio Fontana  * Reciprocal step. These are the AArch32 version which uses a
1315a3ef070eSClaudio Fontana  * non-fused multiply-and-subtract.
1316a3ef070eSClaudio Fontana  */
float16_recps_nf(float16 op1,float16 op2,float_status * stat)1317a3ef070eSClaudio Fontana static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1318a3ef070eSClaudio Fontana {
1319a3ef070eSClaudio Fontana     op1 = float16_squash_input_denormal(op1, stat);
1320a3ef070eSClaudio Fontana     op2 = float16_squash_input_denormal(op2, stat);
1321a3ef070eSClaudio Fontana 
1322a3ef070eSClaudio Fontana     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1323a3ef070eSClaudio Fontana         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1324a3ef070eSClaudio Fontana         return float16_two;
1325a3ef070eSClaudio Fontana     }
1326a3ef070eSClaudio Fontana     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1327a3ef070eSClaudio Fontana }
1328a3ef070eSClaudio Fontana 
float32_recps_nf(float32 op1,float32 op2,float_status * stat)1329a3ef070eSClaudio Fontana static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1330a3ef070eSClaudio Fontana {
1331a3ef070eSClaudio Fontana     op1 = float32_squash_input_denormal(op1, stat);
1332a3ef070eSClaudio Fontana     op2 = float32_squash_input_denormal(op2, stat);
1333a3ef070eSClaudio Fontana 
1334a3ef070eSClaudio Fontana     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1335a3ef070eSClaudio Fontana         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1336a3ef070eSClaudio Fontana         return float32_two;
1337a3ef070eSClaudio Fontana     }
1338a3ef070eSClaudio Fontana     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1339a3ef070eSClaudio Fontana }
1340a3ef070eSClaudio Fontana 
1341a3ef070eSClaudio Fontana /* Reciprocal square-root step. AArch32 non-fused semantics. */
float16_rsqrts_nf(float16 op1,float16 op2,float_status * stat)1342a3ef070eSClaudio Fontana static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1343a3ef070eSClaudio Fontana {
1344a3ef070eSClaudio Fontana     op1 = float16_squash_input_denormal(op1, stat);
1345a3ef070eSClaudio Fontana     op2 = float16_squash_input_denormal(op2, stat);
1346a3ef070eSClaudio Fontana 
1347a3ef070eSClaudio Fontana     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1348a3ef070eSClaudio Fontana         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1349a3ef070eSClaudio Fontana         return float16_one_point_five;
1350a3ef070eSClaudio Fontana     }
1351a3ef070eSClaudio Fontana     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1352a3ef070eSClaudio Fontana     return float16_div(op1, float16_two, stat);
1353a3ef070eSClaudio Fontana }
1354a3ef070eSClaudio Fontana 
float32_rsqrts_nf(float32 op1,float32 op2,float_status * stat)1355a3ef070eSClaudio Fontana static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1356a3ef070eSClaudio Fontana {
1357a3ef070eSClaudio Fontana     op1 = float32_squash_input_denormal(op1, stat);
1358a3ef070eSClaudio Fontana     op2 = float32_squash_input_denormal(op2, stat);
1359a3ef070eSClaudio Fontana 
1360a3ef070eSClaudio Fontana     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1361a3ef070eSClaudio Fontana         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1362a3ef070eSClaudio Fontana         return float32_one_point_five;
1363a3ef070eSClaudio Fontana     }
1364a3ef070eSClaudio Fontana     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1365a3ef070eSClaudio Fontana     return float32_div(op1, float32_two, stat);
1366a3ef070eSClaudio Fontana }
1367a3ef070eSClaudio Fontana 
1368a3ef070eSClaudio Fontana #define DO_3OP(NAME, FUNC, TYPE) \
1369a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1370a3ef070eSClaudio Fontana {                                                                          \
1371a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1372a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm;                                        \
1373a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1374a3ef070eSClaudio Fontana         d[i] = FUNC(n[i], m[i], stat);                                     \
1375a3ef070eSClaudio Fontana     }                                                                      \
1376a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1377a3ef070eSClaudio Fontana }
1378a3ef070eSClaudio Fontana 
DO_3OP(gvec_fadd_h,float16_add,float16)1379a3ef070eSClaudio Fontana DO_3OP(gvec_fadd_h, float16_add, float16)
1380a3ef070eSClaudio Fontana DO_3OP(gvec_fadd_s, float32_add, float32)
1381a3ef070eSClaudio Fontana DO_3OP(gvec_fadd_d, float64_add, float64)
1382a3ef070eSClaudio Fontana 
1383a3ef070eSClaudio Fontana DO_3OP(gvec_fsub_h, float16_sub, float16)
1384a3ef070eSClaudio Fontana DO_3OP(gvec_fsub_s, float32_sub, float32)
1385a3ef070eSClaudio Fontana DO_3OP(gvec_fsub_d, float64_sub, float64)
1386a3ef070eSClaudio Fontana 
1387a3ef070eSClaudio Fontana DO_3OP(gvec_fmul_h, float16_mul, float16)
1388a3ef070eSClaudio Fontana DO_3OP(gvec_fmul_s, float32_mul, float32)
1389a3ef070eSClaudio Fontana DO_3OP(gvec_fmul_d, float64_mul, float64)
1390a3ef070eSClaudio Fontana 
1391a3ef070eSClaudio Fontana DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1392a3ef070eSClaudio Fontana DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1393a3ef070eSClaudio Fontana DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1394a3ef070eSClaudio Fontana 
1395a3ef070eSClaudio Fontana DO_3OP(gvec_fabd_h, float16_abd, float16)
1396a3ef070eSClaudio Fontana DO_3OP(gvec_fabd_s, float32_abd, float32)
139743454734SRichard Henderson DO_3OP(gvec_fabd_d, float64_abd, float64)
1398a3ef070eSClaudio Fontana 
1399a3ef070eSClaudio Fontana DO_3OP(gvec_fceq_h, float16_ceq, float16)
1400a3ef070eSClaudio Fontana DO_3OP(gvec_fceq_s, float32_ceq, float32)
14014fe068faSRichard Henderson DO_3OP(gvec_fceq_d, float64_ceq, float64)
1402a3ef070eSClaudio Fontana 
1403a3ef070eSClaudio Fontana DO_3OP(gvec_fcge_h, float16_cge, float16)
1404a3ef070eSClaudio Fontana DO_3OP(gvec_fcge_s, float32_cge, float32)
14054fe068faSRichard Henderson DO_3OP(gvec_fcge_d, float64_cge, float64)
1406a3ef070eSClaudio Fontana 
1407a3ef070eSClaudio Fontana DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1408a3ef070eSClaudio Fontana DO_3OP(gvec_fcgt_s, float32_cgt, float32)
14094fe068faSRichard Henderson DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1410a3ef070eSClaudio Fontana 
1411a3ef070eSClaudio Fontana DO_3OP(gvec_facge_h, float16_acge, float16)
1412a3ef070eSClaudio Fontana DO_3OP(gvec_facge_s, float32_acge, float32)
14134fe068faSRichard Henderson DO_3OP(gvec_facge_d, float64_acge, float64)
1414a3ef070eSClaudio Fontana 
1415a3ef070eSClaudio Fontana DO_3OP(gvec_facgt_h, float16_acgt, float16)
1416a3ef070eSClaudio Fontana DO_3OP(gvec_facgt_s, float32_acgt, float32)
14174fe068faSRichard Henderson DO_3OP(gvec_facgt_d, float64_acgt, float64)
1418a3ef070eSClaudio Fontana 
1419a3ef070eSClaudio Fontana DO_3OP(gvec_fmax_h, float16_max, float16)
1420a3ef070eSClaudio Fontana DO_3OP(gvec_fmax_s, float32_max, float32)
1421a1e250fcSRichard Henderson DO_3OP(gvec_fmax_d, float64_max, float64)
1422a3ef070eSClaudio Fontana 
1423a3ef070eSClaudio Fontana DO_3OP(gvec_fmin_h, float16_min, float16)
1424a3ef070eSClaudio Fontana DO_3OP(gvec_fmin_s, float32_min, float32)
1425a1e250fcSRichard Henderson DO_3OP(gvec_fmin_d, float64_min, float64)
1426a3ef070eSClaudio Fontana 
1427a3ef070eSClaudio Fontana DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1428a3ef070eSClaudio Fontana DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1429a1e250fcSRichard Henderson DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1430a3ef070eSClaudio Fontana 
1431a3ef070eSClaudio Fontana DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1432a3ef070eSClaudio Fontana DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1433a1e250fcSRichard Henderson DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1434a3ef070eSClaudio Fontana 
1435a3ef070eSClaudio Fontana DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1436a3ef070eSClaudio Fontana DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1437a3ef070eSClaudio Fontana 
1438a3ef070eSClaudio Fontana DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1439a3ef070eSClaudio Fontana DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1440a3ef070eSClaudio Fontana 
1441a3ef070eSClaudio Fontana #ifdef TARGET_AARCH64
1442e0300a9aSRichard Henderson DO_3OP(gvec_fdiv_h, float16_div, float16)
1443e0300a9aSRichard Henderson DO_3OP(gvec_fdiv_s, float32_div, float32)
1444e0300a9aSRichard Henderson DO_3OP(gvec_fdiv_d, float64_div, float64)
1445e0300a9aSRichard Henderson 
1446cb1c77feSRichard Henderson DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1447cb1c77feSRichard Henderson DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1448cb1c77feSRichard Henderson DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1449a3ef070eSClaudio Fontana 
1450a3ef070eSClaudio Fontana DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1451a3ef070eSClaudio Fontana DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1452a3ef070eSClaudio Fontana DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1453a3ef070eSClaudio Fontana 
1454a3ef070eSClaudio Fontana DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1455a3ef070eSClaudio Fontana DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1456a3ef070eSClaudio Fontana DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1457a3ef070eSClaudio Fontana 
1458a3ef070eSClaudio Fontana #endif
1459a3ef070eSClaudio Fontana #undef DO_3OP
1460a3ef070eSClaudio Fontana 
1461a3ef070eSClaudio Fontana /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1462a3ef070eSClaudio Fontana static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1463a3ef070eSClaudio Fontana                                  float_status *stat)
1464a3ef070eSClaudio Fontana {
1465a3ef070eSClaudio Fontana     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1466a3ef070eSClaudio Fontana }
1467a3ef070eSClaudio Fontana 
float32_muladd_nf(float32 dest,float32 op1,float32 op2,float_status * stat)1468a3ef070eSClaudio Fontana static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1469a3ef070eSClaudio Fontana                                  float_status *stat)
1470a3ef070eSClaudio Fontana {
1471a3ef070eSClaudio Fontana     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1472a3ef070eSClaudio Fontana }
1473a3ef070eSClaudio Fontana 
float16_mulsub_nf(float16 dest,float16 op1,float16 op2,float_status * stat)1474a3ef070eSClaudio Fontana static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1475a3ef070eSClaudio Fontana                                  float_status *stat)
1476a3ef070eSClaudio Fontana {
1477a3ef070eSClaudio Fontana     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1478a3ef070eSClaudio Fontana }
1479a3ef070eSClaudio Fontana 
float32_mulsub_nf(float32 dest,float32 op1,float32 op2,float_status * stat)1480a3ef070eSClaudio Fontana static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1481a3ef070eSClaudio Fontana                                  float_status *stat)
1482a3ef070eSClaudio Fontana {
1483a3ef070eSClaudio Fontana     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1484a3ef070eSClaudio Fontana }
1485a3ef070eSClaudio Fontana 
1486a3ef070eSClaudio Fontana /* Fused versions; these have the semantics Neon VFMA/VFMS want */
float16_muladd_f(float16 dest,float16 op1,float16 op2,float_status * stat)1487a3ef070eSClaudio Fontana static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1488a3ef070eSClaudio Fontana                                 float_status *stat)
1489a3ef070eSClaudio Fontana {
1490a3ef070eSClaudio Fontana     return float16_muladd(op1, op2, dest, 0, stat);
1491a3ef070eSClaudio Fontana }
1492a3ef070eSClaudio Fontana 
float32_muladd_f(float32 dest,float32 op1,float32 op2,float_status * stat)1493a3ef070eSClaudio Fontana static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1494a3ef070eSClaudio Fontana                                  float_status *stat)
1495a3ef070eSClaudio Fontana {
1496a3ef070eSClaudio Fontana     return float32_muladd(op1, op2, dest, 0, stat);
1497a3ef070eSClaudio Fontana }
1498a3ef070eSClaudio Fontana 
float64_muladd_f(float64 dest,float64 op1,float64 op2,float_status * stat)14992d558efbSRichard Henderson static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
15002d558efbSRichard Henderson                                  float_status *stat)
15012d558efbSRichard Henderson {
15022d558efbSRichard Henderson     return float64_muladd(op1, op2, dest, 0, stat);
15032d558efbSRichard Henderson }
15042d558efbSRichard Henderson 
float16_mulsub_f(float16 dest,float16 op1,float16 op2,float_status * stat)1505a3ef070eSClaudio Fontana static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1506a3ef070eSClaudio Fontana                                  float_status *stat)
1507a3ef070eSClaudio Fontana {
1508a3ef070eSClaudio Fontana     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1509a3ef070eSClaudio Fontana }
1510a3ef070eSClaudio Fontana 
float32_mulsub_f(float32 dest,float32 op1,float32 op2,float_status * stat)1511a3ef070eSClaudio Fontana static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1512a3ef070eSClaudio Fontana                                  float_status *stat)
1513a3ef070eSClaudio Fontana {
1514a3ef070eSClaudio Fontana     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1515a3ef070eSClaudio Fontana }
1516a3ef070eSClaudio Fontana 
float64_mulsub_f(float64 dest,float64 op1,float64 op2,float_status * stat)15172d558efbSRichard Henderson static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
15182d558efbSRichard Henderson                                  float_status *stat)
15192d558efbSRichard Henderson {
15202d558efbSRichard Henderson     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
15212d558efbSRichard Henderson }
15222d558efbSRichard Henderson 
1523a3ef070eSClaudio Fontana #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1524a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1525a3ef070eSClaudio Fontana {                                                                          \
1526a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1527a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm;                                        \
1528a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1529a3ef070eSClaudio Fontana         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1530a3ef070eSClaudio Fontana     }                                                                      \
1531a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1532a3ef070eSClaudio Fontana }
1533a3ef070eSClaudio Fontana 
DO_MULADD(gvec_fmla_h,float16_muladd_nf,float16)1534a3ef070eSClaudio Fontana DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1535a3ef070eSClaudio Fontana DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1536a3ef070eSClaudio Fontana 
1537a3ef070eSClaudio Fontana DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1538a3ef070eSClaudio Fontana DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1539a3ef070eSClaudio Fontana 
1540a3ef070eSClaudio Fontana DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1541a3ef070eSClaudio Fontana DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
15422d558efbSRichard Henderson DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1543a3ef070eSClaudio Fontana 
1544a3ef070eSClaudio Fontana DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1545a3ef070eSClaudio Fontana DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
15462d558efbSRichard Henderson DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1547a3ef070eSClaudio Fontana 
1548a3ef070eSClaudio Fontana /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1549a3ef070eSClaudio Fontana  * For AdvSIMD, there is of course only one such vector segment.
1550a3ef070eSClaudio Fontana  */
1551a3ef070eSClaudio Fontana 
1552a3ef070eSClaudio Fontana #define DO_MUL_IDX(NAME, TYPE, H) \
1553a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1554a3ef070eSClaudio Fontana {                                                                          \
1555a3ef070eSClaudio Fontana     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1556a3ef070eSClaudio Fontana     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1557a3ef070eSClaudio Fontana     intptr_t idx = simd_data(desc);                                        \
1558a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm;                                        \
1559a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1560a3ef070eSClaudio Fontana         TYPE mm = m[H(i + idx)];                                           \
1561a3ef070eSClaudio Fontana         for (j = 0; j < segment; j++) {                                    \
1562a3ef070eSClaudio Fontana             d[i + j] = n[i + j] * mm;                                      \
1563a3ef070eSClaudio Fontana         }                                                                  \
1564a3ef070eSClaudio Fontana     }                                                                      \
1565a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1566a3ef070eSClaudio Fontana }
1567a3ef070eSClaudio Fontana 
1568a3ef070eSClaudio Fontana DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1569a3ef070eSClaudio Fontana DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1570a3ef070eSClaudio Fontana DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1571a3ef070eSClaudio Fontana 
1572a3ef070eSClaudio Fontana #undef DO_MUL_IDX
1573a3ef070eSClaudio Fontana 
1574a3ef070eSClaudio Fontana #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1575a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1576a3ef070eSClaudio Fontana {                                                                          \
1577a3ef070eSClaudio Fontana     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1578a3ef070eSClaudio Fontana     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1579a3ef070eSClaudio Fontana     intptr_t idx = simd_data(desc);                                        \
1580a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1581a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1582a3ef070eSClaudio Fontana         TYPE mm = m[H(i + idx)];                                           \
1583a3ef070eSClaudio Fontana         for (j = 0; j < segment; j++) {                                    \
1584a3ef070eSClaudio Fontana             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1585a3ef070eSClaudio Fontana         }                                                                  \
1586a3ef070eSClaudio Fontana     }                                                                      \
1587a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1588a3ef070eSClaudio Fontana }
1589a3ef070eSClaudio Fontana 
1590a3ef070eSClaudio Fontana DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1591a3ef070eSClaudio Fontana DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1592a3ef070eSClaudio Fontana DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1593a3ef070eSClaudio Fontana 
1594a3ef070eSClaudio Fontana DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1595a3ef070eSClaudio Fontana DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1596a3ef070eSClaudio Fontana DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1597a3ef070eSClaudio Fontana 
1598a3ef070eSClaudio Fontana #undef DO_MLA_IDX
1599a3ef070eSClaudio Fontana 
1600cb1c77feSRichard Henderson #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1601a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1602a3ef070eSClaudio Fontana {                                                                          \
1603a3ef070eSClaudio Fontana     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1604a3ef070eSClaudio Fontana     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1605a3ef070eSClaudio Fontana     intptr_t idx = simd_data(desc);                                        \
1606a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1608a3ef070eSClaudio Fontana         TYPE mm = m[H(i + idx)];                                           \
1609a3ef070eSClaudio Fontana         for (j = 0; j < segment; j++) {                                    \
1610cb1c77feSRichard Henderson             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1611a3ef070eSClaudio Fontana         }                                                                  \
1612a3ef070eSClaudio Fontana     }                                                                      \
1613a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1614a3ef070eSClaudio Fontana }
1615a3ef070eSClaudio Fontana 
1616cb1c77feSRichard Henderson #define nop(N, M, S) (M)
1617a3ef070eSClaudio Fontana 
1618cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1619cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1620cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1621cb1c77feSRichard Henderson 
1622cb1c77feSRichard Henderson #ifdef TARGET_AARCH64
1623cb1c77feSRichard Henderson 
1624cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1625cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1626cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1627cb1c77feSRichard Henderson 
1628cb1c77feSRichard Henderson #endif
1629cb1c77feSRichard Henderson 
1630cb1c77feSRichard Henderson #undef nop
1631a3ef070eSClaudio Fontana 
1632a3ef070eSClaudio Fontana /*
1633a3ef070eSClaudio Fontana  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1634a3ef070eSClaudio Fontana  * the fused ops below they assume accumulate both from and into Vd.
1635a3ef070eSClaudio Fontana  */
1636cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1637cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1638cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1639cb1c77feSRichard Henderson DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1640a3ef070eSClaudio Fontana 
1641a3ef070eSClaudio Fontana #undef DO_FMUL_IDX
1642a3ef070eSClaudio Fontana 
1643a3ef070eSClaudio Fontana #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1644a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1645a3ef070eSClaudio Fontana                   void *stat, uint32_t desc)                               \
1646a3ef070eSClaudio Fontana {                                                                          \
1647a3ef070eSClaudio Fontana     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1648a3ef070eSClaudio Fontana     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1649a3ef070eSClaudio Fontana     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1650a3ef070eSClaudio Fontana     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1651a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1652a3ef070eSClaudio Fontana     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1653a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1654a3ef070eSClaudio Fontana         TYPE mm = m[H(i + idx)];                                           \
1655a3ef070eSClaudio Fontana         for (j = 0; j < segment; j++) {                                    \
1656a3ef070eSClaudio Fontana             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1657a3ef070eSClaudio Fontana                                      mm, a[i + j], 0, stat);               \
1658a3ef070eSClaudio Fontana         }                                                                  \
1659a3ef070eSClaudio Fontana     }                                                                      \
1660a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1661a3ef070eSClaudio Fontana }
1662a3ef070eSClaudio Fontana 
1663a3ef070eSClaudio Fontana DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1664a3ef070eSClaudio Fontana DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1665a3ef070eSClaudio Fontana DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1666a3ef070eSClaudio Fontana 
1667a3ef070eSClaudio Fontana #undef DO_FMLA_IDX
1668a3ef070eSClaudio Fontana 
1669a3ef070eSClaudio Fontana #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1670a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1671a3ef070eSClaudio Fontana {                                                                          \
1672a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1673a3ef070eSClaudio Fontana     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1674a3ef070eSClaudio Fontana     bool q = false;                                                        \
1675a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1676a3ef070eSClaudio Fontana         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1677a3ef070eSClaudio Fontana         if (dd < MIN) {                                                    \
1678a3ef070eSClaudio Fontana             dd = MIN;                                                      \
1679a3ef070eSClaudio Fontana             q = true;                                                      \
1680a3ef070eSClaudio Fontana         } else if (dd > MAX) {                                             \
1681a3ef070eSClaudio Fontana             dd = MAX;                                                      \
1682a3ef070eSClaudio Fontana             q = true;                                                      \
1683a3ef070eSClaudio Fontana         }                                                                  \
1684a3ef070eSClaudio Fontana         d[i] = dd;                                                         \
1685a3ef070eSClaudio Fontana     }                                                                      \
1686a3ef070eSClaudio Fontana     if (q) {                                                               \
1687a3ef070eSClaudio Fontana         uint32_t *qc = vq;                                                 \
1688a3ef070eSClaudio Fontana         qc[0] = 1;                                                         \
1689a3ef070eSClaudio Fontana     }                                                                      \
1690a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1691a3ef070eSClaudio Fontana }
1692a3ef070eSClaudio Fontana 
1693a3ef070eSClaudio Fontana DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1694a3ef070eSClaudio Fontana DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1695a3ef070eSClaudio Fontana DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1696a3ef070eSClaudio Fontana 
1697a3ef070eSClaudio Fontana DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1698a3ef070eSClaudio Fontana DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1699a3ef070eSClaudio Fontana DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1700a3ef070eSClaudio Fontana 
1701a3ef070eSClaudio Fontana DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1702a3ef070eSClaudio Fontana DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1703a3ef070eSClaudio Fontana DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1704a3ef070eSClaudio Fontana 
1705a3ef070eSClaudio Fontana DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1706a3ef070eSClaudio Fontana DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1707a3ef070eSClaudio Fontana DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1708a3ef070eSClaudio Fontana 
17098f6343aeSRichard Henderson DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
17108f6343aeSRichard Henderson DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
17118f6343aeSRichard Henderson DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
17128f6343aeSRichard Henderson 
17138f6343aeSRichard Henderson DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
17148f6343aeSRichard Henderson DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
17158f6343aeSRichard Henderson DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
17168f6343aeSRichard Henderson 
1717a3ef070eSClaudio Fontana #undef DO_SAT
1718a3ef070eSClaudio Fontana 
1719a3ef070eSClaudio Fontana void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1720a3ef070eSClaudio Fontana                           void *vm, uint32_t desc)
1721a3ef070eSClaudio Fontana {
1722a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);
1723a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
1724a3ef070eSClaudio Fontana     bool q = false;
1725a3ef070eSClaudio Fontana 
1726a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / 8; i++) {
1727a3ef070eSClaudio Fontana         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1728a3ef070eSClaudio Fontana         if (dd < nn) {
1729a3ef070eSClaudio Fontana             dd = UINT64_MAX;
1730a3ef070eSClaudio Fontana             q = true;
1731a3ef070eSClaudio Fontana         }
1732a3ef070eSClaudio Fontana         d[i] = dd;
1733a3ef070eSClaudio Fontana     }
1734a3ef070eSClaudio Fontana     if (q) {
1735a3ef070eSClaudio Fontana         uint32_t *qc = vq;
1736a3ef070eSClaudio Fontana         qc[0] = 1;
1737a3ef070eSClaudio Fontana     }
1738a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));
1739a3ef070eSClaudio Fontana }
1740a3ef070eSClaudio Fontana 
HELPER(gvec_uqsub_d)1741a3ef070eSClaudio Fontana void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1742a3ef070eSClaudio Fontana                           void *vm, uint32_t desc)
1743a3ef070eSClaudio Fontana {
1744a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);
1745a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
1746a3ef070eSClaudio Fontana     bool q = false;
1747a3ef070eSClaudio Fontana 
1748a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / 8; i++) {
1749a3ef070eSClaudio Fontana         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1750a3ef070eSClaudio Fontana         if (nn < mm) {
1751a3ef070eSClaudio Fontana             dd = 0;
1752a3ef070eSClaudio Fontana             q = true;
1753a3ef070eSClaudio Fontana         }
1754a3ef070eSClaudio Fontana         d[i] = dd;
1755a3ef070eSClaudio Fontana     }
1756a3ef070eSClaudio Fontana     if (q) {
1757a3ef070eSClaudio Fontana         uint32_t *qc = vq;
1758a3ef070eSClaudio Fontana         qc[0] = 1;
1759a3ef070eSClaudio Fontana     }
1760a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));
1761a3ef070eSClaudio Fontana }
1762a3ef070eSClaudio Fontana 
HELPER(gvec_sqadd_d)1763a3ef070eSClaudio Fontana void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1764a3ef070eSClaudio Fontana                           void *vm, uint32_t desc)
1765a3ef070eSClaudio Fontana {
1766a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);
1767a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = vm;
1768a3ef070eSClaudio Fontana     bool q = false;
1769a3ef070eSClaudio Fontana 
1770a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / 8; i++) {
1771a3ef070eSClaudio Fontana         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1772a3ef070eSClaudio Fontana         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1773a3ef070eSClaudio Fontana             dd = (nn >> 63) ^ ~INT64_MIN;
1774a3ef070eSClaudio Fontana             q = true;
1775a3ef070eSClaudio Fontana         }
1776a3ef070eSClaudio Fontana         d[i] = dd;
1777a3ef070eSClaudio Fontana     }
1778a3ef070eSClaudio Fontana     if (q) {
1779a3ef070eSClaudio Fontana         uint32_t *qc = vq;
1780a3ef070eSClaudio Fontana         qc[0] = 1;
1781a3ef070eSClaudio Fontana     }
1782a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));
1783a3ef070eSClaudio Fontana }
1784a3ef070eSClaudio Fontana 
HELPER(gvec_sqsub_d)1785a3ef070eSClaudio Fontana void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1786a3ef070eSClaudio Fontana                           void *vm, uint32_t desc)
1787a3ef070eSClaudio Fontana {
1788a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);
1789a3ef070eSClaudio Fontana     int64_t *d = vd, *n = vn, *m = vm;
1790a3ef070eSClaudio Fontana     bool q = false;
1791a3ef070eSClaudio Fontana 
1792a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / 8; i++) {
1793a3ef070eSClaudio Fontana         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1794a3ef070eSClaudio Fontana         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1795a3ef070eSClaudio Fontana             dd = (nn >> 63) ^ ~INT64_MIN;
1796a3ef070eSClaudio Fontana             q = true;
1797a3ef070eSClaudio Fontana         }
1798a3ef070eSClaudio Fontana         d[i] = dd;
1799a3ef070eSClaudio Fontana     }
1800a3ef070eSClaudio Fontana     if (q) {
1801a3ef070eSClaudio Fontana         uint32_t *qc = vq;
1802a3ef070eSClaudio Fontana         qc[0] = 1;
1803a3ef070eSClaudio Fontana     }
1804a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));
1805a3ef070eSClaudio Fontana }
1806a3ef070eSClaudio Fontana 
HELPER(gvec_usqadd_d)18078f6343aeSRichard Henderson void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
18088f6343aeSRichard Henderson                            void *vm, uint32_t desc)
18098f6343aeSRichard Henderson {
18108f6343aeSRichard Henderson     intptr_t i, oprsz = simd_oprsz(desc);
18118f6343aeSRichard Henderson     uint64_t *d = vd, *n = vn, *m = vm;
18128f6343aeSRichard Henderson     bool q = false;
18138f6343aeSRichard Henderson 
18148f6343aeSRichard Henderson     for (i = 0; i < oprsz / 8; i++) {
18158f6343aeSRichard Henderson         uint64_t nn = n[i];
18168f6343aeSRichard Henderson         int64_t mm = m[i];
18178f6343aeSRichard Henderson         uint64_t dd = nn + mm;
18188f6343aeSRichard Henderson 
18198f6343aeSRichard Henderson         if (mm < 0) {
18208f6343aeSRichard Henderson             if (nn < (uint64_t)-mm) {
18218f6343aeSRichard Henderson                 dd = 0;
18228f6343aeSRichard Henderson                 q = true;
18238f6343aeSRichard Henderson             }
18248f6343aeSRichard Henderson         } else {
18258f6343aeSRichard Henderson             if (dd < nn) {
18268f6343aeSRichard Henderson                 dd = UINT64_MAX;
18278f6343aeSRichard Henderson                 q = true;
18288f6343aeSRichard Henderson             }
18298f6343aeSRichard Henderson         }
18308f6343aeSRichard Henderson         d[i] = dd;
18318f6343aeSRichard Henderson     }
18328f6343aeSRichard Henderson     if (q) {
18338f6343aeSRichard Henderson         uint32_t *qc = vq;
18348f6343aeSRichard Henderson         qc[0] = 1;
18358f6343aeSRichard Henderson     }
18368f6343aeSRichard Henderson     clear_tail(d, oprsz, simd_maxsz(desc));
18378f6343aeSRichard Henderson }
18388f6343aeSRichard Henderson 
HELPER(gvec_suqadd_d)18398f6343aeSRichard Henderson void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
18408f6343aeSRichard Henderson                            void *vm, uint32_t desc)
18418f6343aeSRichard Henderson {
18428f6343aeSRichard Henderson     intptr_t i, oprsz = simd_oprsz(desc);
18438f6343aeSRichard Henderson     uint64_t *d = vd, *n = vn, *m = vm;
18448f6343aeSRichard Henderson     bool q = false;
18458f6343aeSRichard Henderson 
18468f6343aeSRichard Henderson     for (i = 0; i < oprsz / 8; i++) {
18478f6343aeSRichard Henderson         int64_t nn = n[i];
18488f6343aeSRichard Henderson         uint64_t mm = m[i];
18498f6343aeSRichard Henderson         int64_t dd = nn + mm;
18508f6343aeSRichard Henderson 
18518f6343aeSRichard Henderson         if (mm > (uint64_t)(INT64_MAX - nn)) {
18528f6343aeSRichard Henderson             dd = INT64_MAX;
18538f6343aeSRichard Henderson             q = true;
18548f6343aeSRichard Henderson         }
18558f6343aeSRichard Henderson         d[i] = dd;
18568f6343aeSRichard Henderson     }
18578f6343aeSRichard Henderson     if (q) {
18588f6343aeSRichard Henderson         uint32_t *qc = vq;
18598f6343aeSRichard Henderson         qc[0] = 1;
18608f6343aeSRichard Henderson     }
18618f6343aeSRichard Henderson     clear_tail(d, oprsz, simd_maxsz(desc));
18628f6343aeSRichard Henderson }
1863a3ef070eSClaudio Fontana 
1864a3ef070eSClaudio Fontana #define DO_SRA(NAME, TYPE)                              \
1865a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1866a3ef070eSClaudio Fontana {                                                       \
1867a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);               \
1868a3ef070eSClaudio Fontana     int shift = simd_data(desc);                        \
1869a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn;                              \
1870a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1871a3ef070eSClaudio Fontana         d[i] += n[i] >> shift;                          \
1872a3ef070eSClaudio Fontana     }                                                   \
1873a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));             \
1874a3ef070eSClaudio Fontana }
1875a3ef070eSClaudio Fontana 
DO_SRA(gvec_ssra_b,int8_t)1876a3ef070eSClaudio Fontana DO_SRA(gvec_ssra_b, int8_t)
1877a3ef070eSClaudio Fontana DO_SRA(gvec_ssra_h, int16_t)
1878a3ef070eSClaudio Fontana DO_SRA(gvec_ssra_s, int32_t)
1879a3ef070eSClaudio Fontana DO_SRA(gvec_ssra_d, int64_t)
1880a3ef070eSClaudio Fontana 
1881a3ef070eSClaudio Fontana DO_SRA(gvec_usra_b, uint8_t)
1882a3ef070eSClaudio Fontana DO_SRA(gvec_usra_h, uint16_t)
1883a3ef070eSClaudio Fontana DO_SRA(gvec_usra_s, uint32_t)
1884a3ef070eSClaudio Fontana DO_SRA(gvec_usra_d, uint64_t)
1885a3ef070eSClaudio Fontana 
1886a3ef070eSClaudio Fontana #undef DO_SRA
1887a3ef070eSClaudio Fontana 
1888a3ef070eSClaudio Fontana #define DO_RSHR(NAME, TYPE)                             \
1889a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1890a3ef070eSClaudio Fontana {                                                       \
1891a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);               \
1892a3ef070eSClaudio Fontana     int shift = simd_data(desc);                        \
1893a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn;                              \
1894a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1895a3ef070eSClaudio Fontana         TYPE tmp = n[i] >> (shift - 1);                 \
1896a3ef070eSClaudio Fontana         d[i] = (tmp >> 1) + (tmp & 1);                  \
1897a3ef070eSClaudio Fontana     }                                                   \
1898a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));             \
1899a3ef070eSClaudio Fontana }
1900a3ef070eSClaudio Fontana 
1901a3ef070eSClaudio Fontana DO_RSHR(gvec_srshr_b, int8_t)
1902a3ef070eSClaudio Fontana DO_RSHR(gvec_srshr_h, int16_t)
1903a3ef070eSClaudio Fontana DO_RSHR(gvec_srshr_s, int32_t)
1904a3ef070eSClaudio Fontana DO_RSHR(gvec_srshr_d, int64_t)
1905a3ef070eSClaudio Fontana 
1906a3ef070eSClaudio Fontana DO_RSHR(gvec_urshr_b, uint8_t)
1907a3ef070eSClaudio Fontana DO_RSHR(gvec_urshr_h, uint16_t)
1908a3ef070eSClaudio Fontana DO_RSHR(gvec_urshr_s, uint32_t)
1909a3ef070eSClaudio Fontana DO_RSHR(gvec_urshr_d, uint64_t)
1910a3ef070eSClaudio Fontana 
1911a3ef070eSClaudio Fontana #undef DO_RSHR
1912a3ef070eSClaudio Fontana 
1913a3ef070eSClaudio Fontana #define DO_RSRA(NAME, TYPE)                             \
1914a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1915a3ef070eSClaudio Fontana {                                                       \
1916a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);               \
1917a3ef070eSClaudio Fontana     int shift = simd_data(desc);                        \
1918a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn;                              \
1919a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1920a3ef070eSClaudio Fontana         TYPE tmp = n[i] >> (shift - 1);                 \
1921a3ef070eSClaudio Fontana         d[i] += (tmp >> 1) + (tmp & 1);                 \
1922a3ef070eSClaudio Fontana     }                                                   \
1923a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));             \
1924a3ef070eSClaudio Fontana }
1925a3ef070eSClaudio Fontana 
1926a3ef070eSClaudio Fontana DO_RSRA(gvec_srsra_b, int8_t)
1927a3ef070eSClaudio Fontana DO_RSRA(gvec_srsra_h, int16_t)
1928a3ef070eSClaudio Fontana DO_RSRA(gvec_srsra_s, int32_t)
1929a3ef070eSClaudio Fontana DO_RSRA(gvec_srsra_d, int64_t)
1930a3ef070eSClaudio Fontana 
1931a3ef070eSClaudio Fontana DO_RSRA(gvec_ursra_b, uint8_t)
1932a3ef070eSClaudio Fontana DO_RSRA(gvec_ursra_h, uint16_t)
1933a3ef070eSClaudio Fontana DO_RSRA(gvec_ursra_s, uint32_t)
1934a3ef070eSClaudio Fontana DO_RSRA(gvec_ursra_d, uint64_t)
1935a3ef070eSClaudio Fontana 
1936a3ef070eSClaudio Fontana #undef DO_RSRA
1937a3ef070eSClaudio Fontana 
1938a3ef070eSClaudio Fontana #define DO_SRI(NAME, TYPE)                              \
1939a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1940a3ef070eSClaudio Fontana {                                                       \
1941a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);               \
1942a3ef070eSClaudio Fontana     int shift = simd_data(desc);                        \
1943a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn;                              \
1944a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1945a3ef070eSClaudio Fontana         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1946a3ef070eSClaudio Fontana     }                                                   \
1947a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));             \
1948a3ef070eSClaudio Fontana }
1949a3ef070eSClaudio Fontana 
1950a3ef070eSClaudio Fontana DO_SRI(gvec_sri_b, uint8_t)
1951a3ef070eSClaudio Fontana DO_SRI(gvec_sri_h, uint16_t)
1952a3ef070eSClaudio Fontana DO_SRI(gvec_sri_s, uint32_t)
1953a3ef070eSClaudio Fontana DO_SRI(gvec_sri_d, uint64_t)
1954a3ef070eSClaudio Fontana 
1955a3ef070eSClaudio Fontana #undef DO_SRI
1956a3ef070eSClaudio Fontana 
1957a3ef070eSClaudio Fontana #define DO_SLI(NAME, TYPE)                              \
1958a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1959a3ef070eSClaudio Fontana {                                                       \
1960a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);               \
1961a3ef070eSClaudio Fontana     int shift = simd_data(desc);                        \
1962a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn;                              \
1963a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1964a3ef070eSClaudio Fontana         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1965a3ef070eSClaudio Fontana     }                                                   \
1966a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));             \
1967a3ef070eSClaudio Fontana }
1968a3ef070eSClaudio Fontana 
1969a3ef070eSClaudio Fontana DO_SLI(gvec_sli_b, uint8_t)
1970a3ef070eSClaudio Fontana DO_SLI(gvec_sli_h, uint16_t)
1971a3ef070eSClaudio Fontana DO_SLI(gvec_sli_s, uint32_t)
1972a3ef070eSClaudio Fontana DO_SLI(gvec_sli_d, uint64_t)
1973a3ef070eSClaudio Fontana 
1974a3ef070eSClaudio Fontana #undef DO_SLI
1975a3ef070eSClaudio Fontana 
1976a3ef070eSClaudio Fontana /*
1977a3ef070eSClaudio Fontana  * Convert float16 to float32, raising no exceptions and
1978a3ef070eSClaudio Fontana  * preserving exceptional values, including SNaN.
1979a3ef070eSClaudio Fontana  * This is effectively an unpack+repack operation.
1980a3ef070eSClaudio Fontana  */
1981a3ef070eSClaudio Fontana static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1982a3ef070eSClaudio Fontana {
1983a3ef070eSClaudio Fontana     const int f16_bias = 15;
1984a3ef070eSClaudio Fontana     const int f32_bias = 127;
1985a3ef070eSClaudio Fontana     uint32_t sign = extract32(f16, 15, 1);
1986a3ef070eSClaudio Fontana     uint32_t exp = extract32(f16, 10, 5);
1987a3ef070eSClaudio Fontana     uint32_t frac = extract32(f16, 0, 10);
1988a3ef070eSClaudio Fontana 
1989a3ef070eSClaudio Fontana     if (exp == 0x1f) {
1990a3ef070eSClaudio Fontana         /* Inf or NaN */
1991a3ef070eSClaudio Fontana         exp = 0xff;
1992a3ef070eSClaudio Fontana     } else if (exp == 0) {
1993a3ef070eSClaudio Fontana         /* Zero or denormal.  */
1994a3ef070eSClaudio Fontana         if (frac != 0) {
1995a3ef070eSClaudio Fontana             if (fz16) {
1996a3ef070eSClaudio Fontana                 frac = 0;
1997a3ef070eSClaudio Fontana             } else {
1998a3ef070eSClaudio Fontana                 /*
1999a3ef070eSClaudio Fontana                  * Denormal; these are all normal float32.
2000a3ef070eSClaudio Fontana                  * Shift the fraction so that the msb is at bit 11,
2001a3ef070eSClaudio Fontana                  * then remove bit 11 as the implicit bit of the
2002a3ef070eSClaudio Fontana                  * normalized float32.  Note that we still go through
2003a3ef070eSClaudio Fontana                  * the shift for normal numbers below, to put the
2004a3ef070eSClaudio Fontana                  * float32 fraction at the right place.
2005a3ef070eSClaudio Fontana                  */
2006a3ef070eSClaudio Fontana                 int shift = clz32(frac) - 21;
2007a3ef070eSClaudio Fontana                 frac = (frac << shift) & 0x3ff;
2008a3ef070eSClaudio Fontana                 exp = f32_bias - f16_bias - shift + 1;
2009a3ef070eSClaudio Fontana             }
2010a3ef070eSClaudio Fontana         }
2011a3ef070eSClaudio Fontana     } else {
2012a3ef070eSClaudio Fontana         /* Normal number; adjust the bias.  */
2013a3ef070eSClaudio Fontana         exp += f32_bias - f16_bias;
2014a3ef070eSClaudio Fontana     }
2015a3ef070eSClaudio Fontana     sign <<= 31;
2016a3ef070eSClaudio Fontana     exp <<= 23;
2017a3ef070eSClaudio Fontana     frac <<= 23 - 10;
2018a3ef070eSClaudio Fontana 
2019a3ef070eSClaudio Fontana     return sign | exp | frac;
2020a3ef070eSClaudio Fontana }
2021a3ef070eSClaudio Fontana 
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branch-free selection of u32[0], u64[0], u32[1], or u64[1]:
     *   - the second qword is read only when both is_q and is_2 are set;
     *   - the value is shifted down by 32 only when is_2 is set and
     *     is_q is clear, bringing the second dword to the bottom.
     * When neither flag is set the upper 32 bits of the result are
     * not meaningful to the caller.
     */
    int qword = is_q & is_2;
    int dword_shift = (is_2 & ~is_q) << 5;

    return ptr[qword] >> dword_shift;
}
2032a3ef070eSClaudio Fontana 
2033a3ef070eSClaudio Fontana /*
2034a3ef070eSClaudio Fontana  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2035a3ef070eSClaudio Fontana  * as there is not yet SVE versions that might use blocking.
2036a3ef070eSClaudio Fontana  */
2037a3ef070eSClaudio Fontana 
do_fmlal(float32 * d,void * vn,void * vm,float_status * fpst,uint32_t desc,bool fz16)2038a3ef070eSClaudio Fontana static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2039a3ef070eSClaudio Fontana                      uint32_t desc, bool fz16)
2040a3ef070eSClaudio Fontana {
2041a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);
2042a3ef070eSClaudio Fontana     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2043a3ef070eSClaudio Fontana     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2044a3ef070eSClaudio Fontana     int is_q = oprsz == 16;
2045a3ef070eSClaudio Fontana     uint64_t n_4, m_4;
2046a3ef070eSClaudio Fontana 
2047a3ef070eSClaudio Fontana     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2048a3ef070eSClaudio Fontana     n_4 = load4_f16(vn, is_q, is_2);
2049a3ef070eSClaudio Fontana     m_4 = load4_f16(vm, is_q, is_2);
2050a3ef070eSClaudio Fontana 
2051a3ef070eSClaudio Fontana     /* Negate all inputs for FMLSL at once.  */
2052a3ef070eSClaudio Fontana     if (is_s) {
2053a3ef070eSClaudio Fontana         n_4 ^= 0x8000800080008000ull;
2054a3ef070eSClaudio Fontana     }
2055a3ef070eSClaudio Fontana 
2056a3ef070eSClaudio Fontana     for (i = 0; i < oprsz / 4; i++) {
2057a3ef070eSClaudio Fontana         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2058a3ef070eSClaudio Fontana         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2059a3ef070eSClaudio Fontana         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2060a3ef070eSClaudio Fontana     }
2061a3ef070eSClaudio Fontana     clear_tail(d, oprsz, simd_maxsz(desc));
2062a3ef070eSClaudio Fontana }
2063a3ef070eSClaudio Fontana 
HELPER(gvec_fmlal_a32)2064a3ef070eSClaudio Fontana void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2065a3ef070eSClaudio Fontana                             void *venv, uint32_t desc)
2066a3ef070eSClaudio Fontana {
2067a3ef070eSClaudio Fontana     CPUARMState *env = venv;
2068a3ef070eSClaudio Fontana     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2069a3ef070eSClaudio Fontana              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2070a3ef070eSClaudio Fontana }
2071a3ef070eSClaudio Fontana 
HELPER(gvec_fmlal_a64)2072a3ef070eSClaudio Fontana void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2073a3ef070eSClaudio Fontana                             void *venv, uint32_t desc)
2074a3ef070eSClaudio Fontana {
2075a3ef070eSClaudio Fontana     CPUARMState *env = venv;
2076a3ef070eSClaudio Fontana     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2077a3ef070eSClaudio Fontana              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2078a3ef070eSClaudio Fontana }
2079a3ef070eSClaudio Fontana 
HELPER(sve2_fmlal_zzzw_s)2080a3ef070eSClaudio Fontana void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2081a3ef070eSClaudio Fontana                                void *venv, uint32_t desc)
2082a3ef070eSClaudio Fontana {
2083a3ef070eSClaudio Fontana     intptr_t i, oprsz = simd_oprsz(desc);
2084a3ef070eSClaudio Fontana     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2085a3ef070eSClaudio Fontana     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2086a3ef070eSClaudio Fontana     CPUARMState *env = venv;
2087a3ef070eSClaudio Fontana     float_status *status = &env->vfp.fp_status;
2088a3ef070eSClaudio Fontana     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2089a3ef070eSClaudio Fontana 
2090a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; i += sizeof(float32)) {
2091a3ef070eSClaudio Fontana         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092a3ef070eSClaudio Fontana         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093a3ef070eSClaudio Fontana         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094a3ef070eSClaudio Fontana         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095a3ef070eSClaudio Fontana         float32 aa = *(float32 *)(va + H1_4(i));
2096a3ef070eSClaudio Fontana 
2097a3ef070eSClaudio Fontana         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098a3ef070eSClaudio Fontana     }
2099a3ef070eSClaudio Fontana }
2100a3ef070eSClaudio Fontana 
/*
 * Indexed form of do_fmlal: every float32 element of @d receives
 * d + n * m, where n comes from the packed float16 inputs in @vn and
 * m is the single float16 element of @vm selected by the index held in
 * descriptor data bits [4:2].  Data bit 0 requests negation of the n
 * inputs and bit 1 selects the second half of @vn.
 */
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t nelem = oprsz / 4;
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    intptr_t e;

    /* Fetch all f16 inputs up front so that writes to d cannot clobber them. */
    uint64_t n_bits = load4_f16(vn, is_q, is_2);

    if (is_s) {
        /* Flip the sign bit of all four packed n inputs in one go.  */
        n_bits ^= 0x8000800080008000ull;
    }

    /* The multiplier is the same for every lane; widen it once.  */
    float32 m32 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (e = 0; e < nelem; e++) {
        int lane_shift = e * 16;
        float32 n32 = float16_to_float32_by_bits(n_bits >> lane_shift, fz16);

        d[H4(e)] = float32_muladd(n32, m32, d[H4(e)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
2128a3ef070eSClaudio Fontana 
HELPER(gvec_fmlal_idx_a32)2129a3ef070eSClaudio Fontana void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130a3ef070eSClaudio Fontana                                 void *venv, uint32_t desc)
2131a3ef070eSClaudio Fontana {
2132a3ef070eSClaudio Fontana     CPUARMState *env = venv;
2133a3ef070eSClaudio Fontana     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2134a3ef070eSClaudio Fontana                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2135a3ef070eSClaudio Fontana }
2136a3ef070eSClaudio Fontana 
HELPER(gvec_fmlal_idx_a64)2137a3ef070eSClaudio Fontana void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2138a3ef070eSClaudio Fontana                                 void *venv, uint32_t desc)
2139a3ef070eSClaudio Fontana {
2140a3ef070eSClaudio Fontana     CPUARMState *env = venv;
2141a3ef070eSClaudio Fontana     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2142a3ef070eSClaudio Fontana                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2143a3ef070eSClaudio Fontana }
2144a3ef070eSClaudio Fontana 
HELPER(sve2_fmlal_zzxw_s)2145a3ef070eSClaudio Fontana void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2146a3ef070eSClaudio Fontana                                void *venv, uint32_t desc)
2147a3ef070eSClaudio Fontana {
2148a3ef070eSClaudio Fontana     intptr_t i, j, oprsz = simd_oprsz(desc);
2149a3ef070eSClaudio Fontana     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2150a3ef070eSClaudio Fontana     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2151a3ef070eSClaudio Fontana     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2152a3ef070eSClaudio Fontana     CPUARMState *env = venv;
2153a3ef070eSClaudio Fontana     float_status *status = &env->vfp.fp_status;
2154a3ef070eSClaudio Fontana     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2155a3ef070eSClaudio Fontana 
2156a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; i += 16) {
2157a3ef070eSClaudio Fontana         float16 mm_16 = *(float16 *)(vm + i + idx);
2158a3ef070eSClaudio Fontana         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2159a3ef070eSClaudio Fontana 
2160a3ef070eSClaudio Fontana         for (j = 0; j < 16; j += sizeof(float32)) {
2161a3ef070eSClaudio Fontana             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2162a3ef070eSClaudio Fontana             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2163a3ef070eSClaudio Fontana             float32 aa = *(float32 *)(va + H1_4(i + j));
2164a3ef070eSClaudio Fontana 
2165a3ef070eSClaudio Fontana             *(float32 *)(vd + H1_4(i + j)) =
2166a3ef070eSClaudio Fontana                 float32_muladd(nn, mm, aa, 0, status);
2167a3ef070eSClaudio Fontana         }
2168a3ef070eSClaudio Fontana     }
2169a3ef070eSClaudio Fontana }
2170a3ef070eSClaudio Fontana 
HELPER(gvec_sshl_b)2171a3ef070eSClaudio Fontana void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2172a3ef070eSClaudio Fontana {
2173a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2174a3ef070eSClaudio Fontana     int8_t *d = vd, *n = vn, *m = vm;
2175a3ef070eSClaudio Fontana 
2176a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
2177a3ef070eSClaudio Fontana         int8_t mm = m[i];
2178a3ef070eSClaudio Fontana         int8_t nn = n[i];
2179a3ef070eSClaudio Fontana         int8_t res = 0;
2180a3ef070eSClaudio Fontana         if (mm >= 0) {
2181a3ef070eSClaudio Fontana             if (mm < 8) {
2182a3ef070eSClaudio Fontana                 res = nn << mm;
2183a3ef070eSClaudio Fontana             }
2184a3ef070eSClaudio Fontana         } else {
2185a3ef070eSClaudio Fontana             res = nn >> (mm > -8 ? -mm : 7);
2186a3ef070eSClaudio Fontana         }
2187a3ef070eSClaudio Fontana         d[i] = res;
2188a3ef070eSClaudio Fontana     }
2189a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2190a3ef070eSClaudio Fontana }
2191a3ef070eSClaudio Fontana 
HELPER(gvec_sshl_h)2192a3ef070eSClaudio Fontana void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2193a3ef070eSClaudio Fontana {
2194a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2195a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm;
2196a3ef070eSClaudio Fontana 
2197a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
2198a3ef070eSClaudio Fontana         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2199a3ef070eSClaudio Fontana         int16_t nn = n[i];
2200a3ef070eSClaudio Fontana         int16_t res = 0;
2201a3ef070eSClaudio Fontana         if (mm >= 0) {
2202a3ef070eSClaudio Fontana             if (mm < 16) {
2203a3ef070eSClaudio Fontana                 res = nn << mm;
2204a3ef070eSClaudio Fontana             }
2205a3ef070eSClaudio Fontana         } else {
2206a3ef070eSClaudio Fontana             res = nn >> (mm > -16 ? -mm : 15);
2207a3ef070eSClaudio Fontana         }
2208a3ef070eSClaudio Fontana         d[i] = res;
2209a3ef070eSClaudio Fontana     }
2210a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2211a3ef070eSClaudio Fontana }
2212a3ef070eSClaudio Fontana 
HELPER(gvec_ushl_b)2213a3ef070eSClaudio Fontana void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2214a3ef070eSClaudio Fontana {
2215a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2216a3ef070eSClaudio Fontana     uint8_t *d = vd, *n = vn, *m = vm;
2217a3ef070eSClaudio Fontana 
2218a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
2219a3ef070eSClaudio Fontana         int8_t mm = m[i];
2220a3ef070eSClaudio Fontana         uint8_t nn = n[i];
2221a3ef070eSClaudio Fontana         uint8_t res = 0;
2222a3ef070eSClaudio Fontana         if (mm >= 0) {
2223a3ef070eSClaudio Fontana             if (mm < 8) {
2224a3ef070eSClaudio Fontana                 res = nn << mm;
2225a3ef070eSClaudio Fontana             }
2226a3ef070eSClaudio Fontana         } else {
2227a3ef070eSClaudio Fontana             if (mm > -8) {
2228a3ef070eSClaudio Fontana                 res = nn >> -mm;
2229a3ef070eSClaudio Fontana             }
2230a3ef070eSClaudio Fontana         }
2231a3ef070eSClaudio Fontana         d[i] = res;
2232a3ef070eSClaudio Fontana     }
2233a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2234a3ef070eSClaudio Fontana }
2235a3ef070eSClaudio Fontana 
HELPER(gvec_ushl_h)2236a3ef070eSClaudio Fontana void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2237a3ef070eSClaudio Fontana {
2238a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2239a3ef070eSClaudio Fontana     uint16_t *d = vd, *n = vn, *m = vm;
2240a3ef070eSClaudio Fontana 
2241a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
2242a3ef070eSClaudio Fontana         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2243a3ef070eSClaudio Fontana         uint16_t nn = n[i];
2244a3ef070eSClaudio Fontana         uint16_t res = 0;
2245a3ef070eSClaudio Fontana         if (mm >= 0) {
2246a3ef070eSClaudio Fontana             if (mm < 16) {
2247a3ef070eSClaudio Fontana                 res = nn << mm;
2248a3ef070eSClaudio Fontana             }
2249a3ef070eSClaudio Fontana         } else {
2250a3ef070eSClaudio Fontana             if (mm > -16) {
2251a3ef070eSClaudio Fontana                 res = nn >> -mm;
2252a3ef070eSClaudio Fontana             }
2253a3ef070eSClaudio Fontana         }
2254a3ef070eSClaudio Fontana         d[i] = res;
2255a3ef070eSClaudio Fontana     }
2256a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2257a3ef070eSClaudio Fontana }
2258a3ef070eSClaudio Fontana 
2259a3ef070eSClaudio Fontana /*
2260a3ef070eSClaudio Fontana  * 8x8->8 polynomial multiply.
2261a3ef070eSClaudio Fontana  *
2262a3ef070eSClaudio Fontana  * Polynomial multiplication is like integer multiplication except the
2263a3ef070eSClaudio Fontana  * partial products are XORed, not added.
2264a3ef070eSClaudio Fontana  *
2265a3ef070eSClaudio Fontana  * TODO: expose this as a generic vector operation, as it is a common
2266a3ef070eSClaudio Fontana  * crypto building block.
2267a3ef070eSClaudio Fontana  */
HELPER(gvec_pmul_b)2268a3ef070eSClaudio Fontana void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2269a3ef070eSClaudio Fontana {
22708e3da4c7SRichard Henderson     intptr_t i, opr_sz = simd_oprsz(desc);
2271a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2272a3ef070eSClaudio Fontana 
2273a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
22748e3da4c7SRichard Henderson         d[i] = clmul_8x8_low(n[i], m[i]);
2275a3ef070eSClaudio Fontana     }
2276a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2277a3ef070eSClaudio Fontana }
2278a3ef070eSClaudio Fontana 
2279a3ef070eSClaudio Fontana /*
2280a3ef070eSClaudio Fontana  * 64x64->128 polynomial multiply.
2281a3ef070eSClaudio Fontana  * Because of the lanes are not accessed in strict columns,
2282a3ef070eSClaudio Fontana  * this probably cannot be turned into a generic helper.
2283a3ef070eSClaudio Fontana  */
HELPER(gvec_pmull_q)2284a3ef070eSClaudio Fontana void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2285a3ef070eSClaudio Fontana {
2286a50cfdf0SRichard Henderson     intptr_t i, opr_sz = simd_oprsz(desc);
2287a3ef070eSClaudio Fontana     intptr_t hi = simd_data(desc);
2288a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2289a3ef070eSClaudio Fontana 
2290a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; i += 2) {
2291a50cfdf0SRichard Henderson         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2292a50cfdf0SRichard Henderson         d[i] = int128_getlo(r);
2293a50cfdf0SRichard Henderson         d[i + 1] = int128_gethi(r);
2294a3ef070eSClaudio Fontana     }
2295a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2296a3ef070eSClaudio Fontana }
2297a3ef070eSClaudio Fontana 
HELPER(neon_pmull_h)2298a3ef070eSClaudio Fontana void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2299a3ef070eSClaudio Fontana {
2300a3ef070eSClaudio Fontana     int hi = simd_data(desc);
2301a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2302a3ef070eSClaudio Fontana     uint64_t nn = n[hi], mm = m[hi];
2303a3ef070eSClaudio Fontana 
23048e3da4c7SRichard Henderson     d[0] = clmul_8x4_packed(nn, mm);
2305a3ef070eSClaudio Fontana     nn >>= 32;
2306a3ef070eSClaudio Fontana     mm >>= 32;
23078e3da4c7SRichard Henderson     d[1] = clmul_8x4_packed(nn, mm);
2308a3ef070eSClaudio Fontana 
2309a3ef070eSClaudio Fontana     clear_tail(d, 16, simd_maxsz(desc));
2310a3ef070eSClaudio Fontana }
2311a3ef070eSClaudio Fontana 
2312a3ef070eSClaudio Fontana #ifdef TARGET_AARCH64
HELPER(sve2_pmull_h)2313a3ef070eSClaudio Fontana void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2314a3ef070eSClaudio Fontana {
2315a3ef070eSClaudio Fontana     int shift = simd_data(desc) * 8;
2316a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2317a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2318a3ef070eSClaudio Fontana 
2319a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
23208e3da4c7SRichard Henderson         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2321a3ef070eSClaudio Fontana     }
2322a3ef070eSClaudio Fontana }
2323a3ef070eSClaudio Fontana 
HELPER(sve2_pmull_d)2324a3ef070eSClaudio Fontana void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2325a3ef070eSClaudio Fontana {
2326a3ef070eSClaudio Fontana     intptr_t sel = H4(simd_data(desc));
2327a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2328a3ef070eSClaudio Fontana     uint32_t *n = vn, *m = vm;
2329a3ef070eSClaudio Fontana     uint64_t *d = vd;
2330a3ef070eSClaudio Fontana 
2331a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
2332bae25f64SRichard Henderson         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2333a3ef070eSClaudio Fontana     }
2334a3ef070eSClaudio Fontana }
2335a3ef070eSClaudio Fontana #endif
2336a3ef070eSClaudio Fontana 
2337a3ef070eSClaudio Fontana #define DO_CMP0(NAME, TYPE, OP)                         \
2338a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2339a3ef070eSClaudio Fontana {                                                       \
2340a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);              \
2341a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2342a3ef070eSClaudio Fontana         TYPE nn = *(TYPE *)(vn + i);                    \
2343a3ef070eSClaudio Fontana         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2344a3ef070eSClaudio Fontana     }                                                   \
2345a3ef070eSClaudio Fontana     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2346a3ef070eSClaudio Fontana }
2347a3ef070eSClaudio Fontana 
2348a3ef070eSClaudio Fontana DO_CMP0(gvec_ceq0_b, int8_t, ==)
2349a3ef070eSClaudio Fontana DO_CMP0(gvec_clt0_b, int8_t, <)
2350a3ef070eSClaudio Fontana DO_CMP0(gvec_cle0_b, int8_t, <=)
2351a3ef070eSClaudio Fontana DO_CMP0(gvec_cgt0_b, int8_t, >)
2352a3ef070eSClaudio Fontana DO_CMP0(gvec_cge0_b, int8_t, >=)
2353a3ef070eSClaudio Fontana 
2354a3ef070eSClaudio Fontana DO_CMP0(gvec_ceq0_h, int16_t, ==)
2355a3ef070eSClaudio Fontana DO_CMP0(gvec_clt0_h, int16_t, <)
2356a3ef070eSClaudio Fontana DO_CMP0(gvec_cle0_h, int16_t, <=)
2357a3ef070eSClaudio Fontana DO_CMP0(gvec_cgt0_h, int16_t, >)
2358a3ef070eSClaudio Fontana DO_CMP0(gvec_cge0_h, int16_t, >=)
2359a3ef070eSClaudio Fontana 
2360a3ef070eSClaudio Fontana #undef DO_CMP0
2361a3ef070eSClaudio Fontana 
2362a3ef070eSClaudio Fontana #define DO_ABD(NAME, TYPE)                                      \
2363a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2364a3ef070eSClaudio Fontana {                                                               \
2365a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2366a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm;                             \
2367a3ef070eSClaudio Fontana                                                                 \
2368a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2369a3ef070eSClaudio Fontana         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2370a3ef070eSClaudio Fontana     }                                                           \
2371a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2372a3ef070eSClaudio Fontana }
2373a3ef070eSClaudio Fontana 
DO_ABD(gvec_sabd_b,int8_t)2374a3ef070eSClaudio Fontana DO_ABD(gvec_sabd_b, int8_t)
2375a3ef070eSClaudio Fontana DO_ABD(gvec_sabd_h, int16_t)
2376a3ef070eSClaudio Fontana DO_ABD(gvec_sabd_s, int32_t)
2377a3ef070eSClaudio Fontana DO_ABD(gvec_sabd_d, int64_t)
2378a3ef070eSClaudio Fontana 
2379a3ef070eSClaudio Fontana DO_ABD(gvec_uabd_b, uint8_t)
2380a3ef070eSClaudio Fontana DO_ABD(gvec_uabd_h, uint16_t)
2381a3ef070eSClaudio Fontana DO_ABD(gvec_uabd_s, uint32_t)
2382a3ef070eSClaudio Fontana DO_ABD(gvec_uabd_d, uint64_t)
2383a3ef070eSClaudio Fontana 
2384a3ef070eSClaudio Fontana #undef DO_ABD
2385a3ef070eSClaudio Fontana 
2386a3ef070eSClaudio Fontana #define DO_ABA(NAME, TYPE)                                      \
2387a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2388a3ef070eSClaudio Fontana {                                                               \
2389a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2390a3ef070eSClaudio Fontana     TYPE *d = vd, *n = vn, *m = vm;                             \
2391a3ef070eSClaudio Fontana                                                                 \
2392a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2393a3ef070eSClaudio Fontana         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2394a3ef070eSClaudio Fontana     }                                                           \
2395a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2396a3ef070eSClaudio Fontana }
2397a3ef070eSClaudio Fontana 
2398a3ef070eSClaudio Fontana DO_ABA(gvec_saba_b, int8_t)
2399a3ef070eSClaudio Fontana DO_ABA(gvec_saba_h, int16_t)
2400a3ef070eSClaudio Fontana DO_ABA(gvec_saba_s, int32_t)
2401a3ef070eSClaudio Fontana DO_ABA(gvec_saba_d, int64_t)
2402a3ef070eSClaudio Fontana 
2403a3ef070eSClaudio Fontana DO_ABA(gvec_uaba_b, uint8_t)
2404a3ef070eSClaudio Fontana DO_ABA(gvec_uaba_h, uint16_t)
2405a3ef070eSClaudio Fontana DO_ABA(gvec_uaba_s, uint32_t)
2406a3ef070eSClaudio Fontana DO_ABA(gvec_uaba_d, uint64_t)
2407a3ef070eSClaudio Fontana 
2408a3ef070eSClaudio Fontana #undef DO_ABA
2409a3ef070eSClaudio Fontana 
241057801ca0SRichard Henderson #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
241157801ca0SRichard Henderson void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
241257801ca0SRichard Henderson {                                                                          \
241357801ca0SRichard Henderson     ARMVectorReg scratch;                                                  \
241457801ca0SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);                                     \
241557801ca0SRichard Henderson     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
241657801ca0SRichard Henderson     TYPE *d = vd, *n = vn, *m = vm;                                        \
241757801ca0SRichard Henderson     if (unlikely(d == m)) {                                                \
241857801ca0SRichard Henderson         m = memcpy(&scratch, m, oprsz);                                    \
241957801ca0SRichard Henderson     }                                                                      \
242057801ca0SRichard Henderson     for (intptr_t i = 0; i < half; ++i) {                                  \
242157801ca0SRichard Henderson         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
242257801ca0SRichard Henderson     }                                                                      \
242357801ca0SRichard Henderson     for (intptr_t i = 0; i < half; ++i) {                                  \
242457801ca0SRichard Henderson         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
242557801ca0SRichard Henderson     }                                                                      \
242657801ca0SRichard Henderson     clear_tail(d, oprsz, simd_maxsz(desc));                                \
242757801ca0SRichard Henderson }
242857801ca0SRichard Henderson 
242957801ca0SRichard Henderson DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
243057801ca0SRichard Henderson DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
243157801ca0SRichard Henderson DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
243257801ca0SRichard Henderson 
2433a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2434a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2435a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2436a13f9fb5SRichard Henderson 
2437a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2438a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2439a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2440a13f9fb5SRichard Henderson 
2441a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2442a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2443a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2444a13f9fb5SRichard Henderson 
2445a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2446a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2447a13f9fb5SRichard Henderson DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2448a13f9fb5SRichard Henderson 
2449a7e4eec6SRichard Henderson #undef DO_3OP_PAIR
2450a7e4eec6SRichard Henderson 
2451a7e4eec6SRichard Henderson #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2452a7e4eec6SRichard Henderson void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2453a7e4eec6SRichard Henderson {                                                               \
2454a7e4eec6SRichard Henderson     ARMVectorReg scratch;                                       \
2455a7e4eec6SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);                          \
2456a7e4eec6SRichard Henderson     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2457a7e4eec6SRichard Henderson     TYPE *d = vd, *n = vn, *m = vm;                             \
2458a7e4eec6SRichard Henderson     if (unlikely(d == m)) {                                     \
2459a7e4eec6SRichard Henderson         m = memcpy(&scratch, m, oprsz);                         \
2460a7e4eec6SRichard Henderson     }                                                           \
2461a7e4eec6SRichard Henderson     for (intptr_t i = 0; i < half; ++i) {                       \
2462a7e4eec6SRichard Henderson         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2463a7e4eec6SRichard Henderson     }                                                           \
2464a7e4eec6SRichard Henderson     for (intptr_t i = 0; i < half; ++i) {                       \
2465a7e4eec6SRichard Henderson         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2466a7e4eec6SRichard Henderson     }                                                           \
2467a7e4eec6SRichard Henderson     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2468a7e4eec6SRichard Henderson }
2469a7e4eec6SRichard Henderson 
2470a7e4eec6SRichard Henderson #define ADD(A, B) (A + B)
2471a7e4eec6SRichard Henderson DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2472a7e4eec6SRichard Henderson DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2473a7e4eec6SRichard Henderson DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2474a7e4eec6SRichard Henderson DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2475a7e4eec6SRichard Henderson #undef  ADD
2476a7e4eec6SRichard Henderson 
247728b5451bSRichard Henderson DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
247828b5451bSRichard Henderson DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
247928b5451bSRichard Henderson DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
248028b5451bSRichard Henderson 
248128b5451bSRichard Henderson DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
248228b5451bSRichard Henderson DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
248328b5451bSRichard Henderson DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
248428b5451bSRichard Henderson 
248528b5451bSRichard Henderson DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
248628b5451bSRichard Henderson DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
248728b5451bSRichard Henderson DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
248828b5451bSRichard Henderson 
248928b5451bSRichard Henderson DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
249028b5451bSRichard Henderson DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
249128b5451bSRichard Henderson DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
249228b5451bSRichard Henderson 
2493a7e4eec6SRichard Henderson #undef DO_3OP_PAIR
2494a7e4eec6SRichard Henderson 
2495a3ef070eSClaudio Fontana #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2496a3ef070eSClaudio Fontana     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2497a3ef070eSClaudio Fontana     {                                                                   \
2498a3ef070eSClaudio Fontana         intptr_t i, oprsz = simd_oprsz(desc);                           \
2499a3ef070eSClaudio Fontana         int shift = simd_data(desc);                                    \
2500a3ef070eSClaudio Fontana         TYPE *d = vd, *n = vn;                                          \
2501a3ef070eSClaudio Fontana         float_status *fpst = stat;                                      \
2502a3ef070eSClaudio Fontana         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2503a3ef070eSClaudio Fontana             d[i] = FUNC(n[i], shift, fpst);                             \
2504a3ef070eSClaudio Fontana         }                                                               \
2505a3ef070eSClaudio Fontana         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2506a3ef070eSClaudio Fontana     }
2507a3ef070eSClaudio Fontana 
2508a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2509a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2510a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2511a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2512a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2513a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2514a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2515a3ef070eSClaudio Fontana DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2516a3ef070eSClaudio Fontana 
2517a3ef070eSClaudio Fontana #undef DO_VCVT_FIXED
2518a3ef070eSClaudio Fontana 
2519a3ef070eSClaudio Fontana #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2520a3ef070eSClaudio Fontana     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2521a3ef070eSClaudio Fontana     {                                                                   \
2522a3ef070eSClaudio Fontana         float_status *fpst = stat;                                      \
2523a3ef070eSClaudio Fontana         intptr_t i, oprsz = simd_oprsz(desc);                           \
2524a3ef070eSClaudio Fontana         uint32_t rmode = simd_data(desc);                               \
2525a3ef070eSClaudio Fontana         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2526a3ef070eSClaudio Fontana         TYPE *d = vd, *n = vn;                                          \
2527a3ef070eSClaudio Fontana         set_float_rounding_mode(rmode, fpst);                           \
2528a3ef070eSClaudio Fontana         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2529a3ef070eSClaudio Fontana             d[i] = FUNC(n[i], 0, fpst);                                 \
2530a3ef070eSClaudio Fontana         }                                                               \
2531a3ef070eSClaudio Fontana         set_float_rounding_mode(prev_rmode, fpst);                      \
2532a3ef070eSClaudio Fontana         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2533a3ef070eSClaudio Fontana     }
2534a3ef070eSClaudio Fontana 
2535a3ef070eSClaudio Fontana DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2536a3ef070eSClaudio Fontana DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2537a3ef070eSClaudio Fontana DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2538a3ef070eSClaudio Fontana DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2539a3ef070eSClaudio Fontana 
2540a3ef070eSClaudio Fontana #undef DO_VCVT_RMODE
2541a3ef070eSClaudio Fontana 
2542a3ef070eSClaudio Fontana #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2543a3ef070eSClaudio Fontana     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2544a3ef070eSClaudio Fontana     {                                                                   \
2545a3ef070eSClaudio Fontana         float_status *fpst = stat;                                      \
2546a3ef070eSClaudio Fontana         intptr_t i, oprsz = simd_oprsz(desc);                           \
2547a3ef070eSClaudio Fontana         uint32_t rmode = simd_data(desc);                               \
2548a3ef070eSClaudio Fontana         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2549a3ef070eSClaudio Fontana         TYPE *d = vd, *n = vn;                                          \
2550a3ef070eSClaudio Fontana         set_float_rounding_mode(rmode, fpst);                           \
2551a3ef070eSClaudio Fontana         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2552a3ef070eSClaudio Fontana             d[i] = FUNC(n[i], fpst);                                    \
2553a3ef070eSClaudio Fontana         }                                                               \
2554a3ef070eSClaudio Fontana         set_float_rounding_mode(prev_rmode, fpst);                      \
2555a3ef070eSClaudio Fontana         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2556a3ef070eSClaudio Fontana     }
2557a3ef070eSClaudio Fontana 
2558a3ef070eSClaudio Fontana DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2559a3ef070eSClaudio Fontana DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2560a3ef070eSClaudio Fontana 
2561a3ef070eSClaudio Fontana #undef DO_VRINT_RMODE
2562a3ef070eSClaudio Fontana 
#ifdef TARGET_AARCH64
/*
 * Table lookup (TBL) and table lookup extension (TBX).
 *
 * vm holds one byte index per result byte.  desc encodes, above
 * SIMD_DATA_SHIFT: the number of the first table register (5 bits),
 * the TBX flag (1 bit) and the table length in bytes (remaining bits).
 */
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    const uint8_t *idx = vm;
    const size_t opr_sz = simd_oprsz(desc);
    const uint32_t first_reg = extract32(desc, SIMD_DATA_SHIFT, 5);
    const bool keep_dest = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    const uint32_t table_bytes = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } out;
    size_t pos;

    /*
     * The result is assembled in a temporary because the destination
     * may overlap the table registers.  TBX starts from the current
     * destination contents, TBL from zero.  A full 16 bytes is always
     * transferred, which saves a branch; for oprsz == 8 the upper
     * bytes are cleared by clear_tail() at the end.
     */
    if (keep_dest) {
        memcpy(&out, vd, sizeof(out));
    } else {
        memset(&out, 0, sizeof(out));
    }

    for (pos = 0; pos < opr_sz; pos++) {
        const uint32_t sel = idx[H1(pos)];
        const uint8_t *reg;

        /* Out-of-range indices leave the initial byte untouched. */
        if (sel >= table_bytes) {
            continue;
        }

        /*
         * sel is a byte offset into the virtual table formed by
         * concatenating consecutive 128-bit registers: the upper bits
         * pick the register (wrapping from V31 back to V0) and the
         * low four bits pick the byte within it.
         */
        reg = (const uint8_t *)aa64_vfp_qreg(env, (first_reg + sel / 16) % 32);
        out.b[H1(pos)] = reg[H1(sel % 16)];
    }

    memcpy(vd, &out, sizeof(out));
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
#endif
2610a3ef070eSClaudio Fontana 
2611a3ef070eSClaudio Fontana /*
2612a3ef070eSClaudio Fontana  * NxN -> N highpart multiply
2613a3ef070eSClaudio Fontana  *
2614a3ef070eSClaudio Fontana  * TODO: expose this as a generic vector operation.
2615a3ef070eSClaudio Fontana  */
2616a3ef070eSClaudio Fontana 
HELPER(gvec_smulh_b)2617a3ef070eSClaudio Fontana void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2618a3ef070eSClaudio Fontana {
2619a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2620a3ef070eSClaudio Fontana     int8_t *d = vd, *n = vn, *m = vm;
2621a3ef070eSClaudio Fontana 
2622a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
2623a3ef070eSClaudio Fontana         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2624a3ef070eSClaudio Fontana     }
2625a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2626a3ef070eSClaudio Fontana }
2627a3ef070eSClaudio Fontana 
HELPER(gvec_smulh_h)2628a3ef070eSClaudio Fontana void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2629a3ef070eSClaudio Fontana {
2630a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2631a3ef070eSClaudio Fontana     int16_t *d = vd, *n = vn, *m = vm;
2632a3ef070eSClaudio Fontana 
2633a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
2634a3ef070eSClaudio Fontana         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2635a3ef070eSClaudio Fontana     }
2636a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2637a3ef070eSClaudio Fontana }
2638a3ef070eSClaudio Fontana 
HELPER(gvec_smulh_s)2639a3ef070eSClaudio Fontana void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2640a3ef070eSClaudio Fontana {
2641a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2642a3ef070eSClaudio Fontana     int32_t *d = vd, *n = vn, *m = vm;
2643a3ef070eSClaudio Fontana 
2644a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
2645a3ef070eSClaudio Fontana         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2646a3ef070eSClaudio Fontana     }
2647a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2648a3ef070eSClaudio Fontana }
2649a3ef070eSClaudio Fontana 
HELPER(gvec_smulh_d)2650a3ef070eSClaudio Fontana void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2651a3ef070eSClaudio Fontana {
2652a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2653a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2654a3ef070eSClaudio Fontana     uint64_t discard;
2655a3ef070eSClaudio Fontana 
2656a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
2657a3ef070eSClaudio Fontana         muls64(&discard, &d[i], n[i], m[i]);
2658a3ef070eSClaudio Fontana     }
2659a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2660a3ef070eSClaudio Fontana }
2661a3ef070eSClaudio Fontana 
HELPER(gvec_umulh_b)2662a3ef070eSClaudio Fontana void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2663a3ef070eSClaudio Fontana {
2664a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2665a3ef070eSClaudio Fontana     uint8_t *d = vd, *n = vn, *m = vm;
2666a3ef070eSClaudio Fontana 
2667a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
2668a3ef070eSClaudio Fontana         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2669a3ef070eSClaudio Fontana     }
2670a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2671a3ef070eSClaudio Fontana }
2672a3ef070eSClaudio Fontana 
HELPER(gvec_umulh_h)2673a3ef070eSClaudio Fontana void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2674a3ef070eSClaudio Fontana {
2675a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2676a3ef070eSClaudio Fontana     uint16_t *d = vd, *n = vn, *m = vm;
2677a3ef070eSClaudio Fontana 
2678a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 2; ++i) {
2679a3ef070eSClaudio Fontana         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2680a3ef070eSClaudio Fontana     }
2681a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2682a3ef070eSClaudio Fontana }
2683a3ef070eSClaudio Fontana 
HELPER(gvec_umulh_s)2684a3ef070eSClaudio Fontana void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2685a3ef070eSClaudio Fontana {
2686a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2687a3ef070eSClaudio Fontana     uint32_t *d = vd, *n = vn, *m = vm;
2688a3ef070eSClaudio Fontana 
2689a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 4; ++i) {
2690a3ef070eSClaudio Fontana         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2691a3ef070eSClaudio Fontana     }
2692a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2693a3ef070eSClaudio Fontana }
2694a3ef070eSClaudio Fontana 
HELPER(gvec_umulh_d)2695a3ef070eSClaudio Fontana void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2696a3ef070eSClaudio Fontana {
2697a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2698a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2699a3ef070eSClaudio Fontana     uint64_t discard;
2700a3ef070eSClaudio Fontana 
2701a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz / 8; ++i) {
2702a3ef070eSClaudio Fontana         mulu64(&discard, &d[i], n[i], m[i]);
2703a3ef070eSClaudio Fontana     }
2704a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2705a3ef070eSClaudio Fontana }
2706a3ef070eSClaudio Fontana 
HELPER(gvec_xar_d)2707a3ef070eSClaudio Fontana void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2708a3ef070eSClaudio Fontana {
2709a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2710a3ef070eSClaudio Fontana     int shr = simd_data(desc);
2711a3ef070eSClaudio Fontana     uint64_t *d = vd, *n = vn, *m = vm;
2712a3ef070eSClaudio Fontana 
2713a3ef070eSClaudio Fontana     for (i = 0; i < opr_sz; ++i) {
2714a3ef070eSClaudio Fontana         d[i] = ror64(n[i] ^ m[i], shr);
2715a3ef070eSClaudio Fontana     }
2716a3ef070eSClaudio Fontana     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2717a3ef070eSClaudio Fontana }
2718a3ef070eSClaudio Fontana 
2719a3ef070eSClaudio Fontana /*
2720a3ef070eSClaudio Fontana  * Integer matrix-multiply accumulate
2721a3ef070eSClaudio Fontana  */
2722a3ef070eSClaudio Fontana 
/*
 * Add to sum the dot product of eight signed bytes at vn with eight
 * signed bytes at vm, and return the new sum.
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *lhs = vn, *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        int prod = lhs[H1(k)] * rhs[H1(k)];

        sum += prod;
        k++;
    }
    return sum;
}
2732a3ef070eSClaudio Fontana 
/*
 * Add to sum the dot product of eight unsigned bytes at vn with eight
 * unsigned bytes at vm, and return the new sum.
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *lhs = vn, *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        int prod = lhs[H1(k)] * rhs[H1(k)];

        sum += prod;
        k++;
    }
    return sum;
}
2742a3ef070eSClaudio Fontana 
/*
 * Add to sum the dot product of eight unsigned bytes at vn with eight
 * signed bytes at vm, and return the new sum.
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *lhs = vn;
    int8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        int prod = lhs[H1(k)] * rhs[H1(k)];

        sum += prod;
        k++;
    }
    return sum;
}
2753a3ef070eSClaudio Fontana 
/*
 * Common body of the byte matrix-multiply-accumulate helpers.
 *
 * The operands are processed in 16-byte segments.  Within a segment,
 * 32-bit result element 2 * i + j is the accumulator element from va
 * plus inner_loop() applied to 8-byte half i of vn and 8-byte half j
 * of vm.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    const intptr_t opr_sz = simd_oprsz(desc);

    for (intptr_t off = 0; off < opr_sz; off += 16) {
        uint32_t *dst = vd + off;
        uint32_t *acc = va + off;
        void *n_lo = vn + off, *n_hi = vn + off + 8;
        void *m_lo = vm + off, *m_hi = vm + off + 8;
        uint32_t r[4];

        /*
         * Compute all four results before storing any of them, so
         * that a destination overlapping an input is read in full
         * before it is overwritten.
         */
        r[0] = inner_loop(acc[H4(0)], n_lo, m_lo);
        r[1] = inner_loop(acc[H4(1)], n_lo, m_hi);
        r[2] = inner_loop(acc[H4(2)], n_hi, m_lo);
        r[3] = inner_loop(acc[H4(3)], n_hi, m_hi);

        for (int e = 0; e < 4; e++) {
            dst[H4(e)] = r[e];
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
2787a3ef070eSClaudio Fontana 
2788a3ef070eSClaudio Fontana #define DO_MMLA_B(NAME, INNER) \
2789a3ef070eSClaudio Fontana     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2790a3ef070eSClaudio Fontana     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2791a3ef070eSClaudio Fontana 
DO_MMLA_B(gvec_smmla_b,do_smmla_b)2792a3ef070eSClaudio Fontana DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2793a3ef070eSClaudio Fontana DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2794a3ef070eSClaudio Fontana DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2795a3ef070eSClaudio Fontana 
2796a3ef070eSClaudio Fontana /*
2797a3ef070eSClaudio Fontana  * BFloat16 Dot Product
2798a3ef070eSClaudio Fontana  */
2799a3ef070eSClaudio Fontana 
2800a3ef070eSClaudio Fontana bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2801a3ef070eSClaudio Fontana {
2802a3ef070eSClaudio Fontana     /*
2803a3ef070eSClaudio Fontana      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2804a3ef070eSClaudio Fontana      * For EBF = 0, we ignore the FPCR bits which determine rounding
2805a3ef070eSClaudio Fontana      * mode and denormal-flushing, and we do unfused multiplies and
2806a3ef070eSClaudio Fontana      * additions with intermediate rounding of all products and sums.
2807a3ef070eSClaudio Fontana      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2808a3ef070eSClaudio Fontana      * and we perform a fused two-way sum-of-products without intermediate
2809a3ef070eSClaudio Fontana      * rounding of the products.
2810a3ef070eSClaudio Fontana      * In either case, we don't set fp exception flags.
2811a3ef070eSClaudio Fontana      *
2812a3ef070eSClaudio Fontana      * EBF is AArch64 only, so even if it's set in the FPCR it has
2813a3ef070eSClaudio Fontana      * no effect on AArch32 instructions.
2814a3ef070eSClaudio Fontana      */
2815a3ef070eSClaudio Fontana     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2816a3ef070eSClaudio Fontana     *statusp = (float_status){
2817a3ef070eSClaudio Fontana         .tininess_before_rounding = float_tininess_before_rounding,
2818a3ef070eSClaudio Fontana         .float_rounding_mode = float_round_to_odd_inf,
2819a3ef070eSClaudio Fontana         .flush_to_zero = true,
2820a3ef070eSClaudio Fontana         .flush_inputs_to_zero = true,
2821a3ef070eSClaudio Fontana         .default_nan_mode = true,
2822a3ef070eSClaudio Fontana     };
2823a3ef070eSClaudio Fontana 
2824a3ef070eSClaudio Fontana     if (ebf) {
2825a3ef070eSClaudio Fontana         float_status *fpst = &env->vfp.fp_status;
2826a3ef070eSClaudio Fontana         set_flush_to_zero(get_flush_to_zero(fpst), statusp);
2827a3ef070eSClaudio Fontana         set_flush_inputs_to_zero(get_flush_inputs_to_zero(fpst), statusp);
2828a3ef070eSClaudio Fontana         set_float_rounding_mode(get_float_rounding_mode(fpst), statusp);
2829a3ef070eSClaudio Fontana 
2830a3ef070eSClaudio Fontana         /* EBF=1 needs to do a step with round-to-odd semantics */
2831a3ef070eSClaudio Fontana         *oddstatusp = *statusp;
2832a3ef070eSClaudio Fontana         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2833a3ef070eSClaudio Fontana     }
2834a3ef070eSClaudio Fontana 
2835a3ef070eSClaudio Fontana     return ebf;
2836a3ef070eSClaudio Fontana }
2837a3ef070eSClaudio Fontana 
/*
 * Unfused BFloat16 dot-product accumulate: e1 and e2 each hold two
 * BFloat16 values.  Both products, their sum, and the final addition
 * to 'sum' are separate float32 operations using fpst.
 */
float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
{
    /*
     * A BFloat16 becomes a float32 once it sits in the top 16 bits:
     * shift up the low element, mask off everything but the high one.
     */
    float32 prod_lo = float32_mul(e1 << 16, e2 << 16, fpst);
    float32 prod_hi = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
    float32 dot = float32_add(prod_lo, prod_hi, fpst);

    return float32_add(sum, dot, fpst);
}
2853a3ef070eSClaudio Fontana 
/*
 * Fused (EBF = 1) BFloat16 dot-product accumulate.
 *
 * Compare f16_dotadd() in sme_helper.c; the inputs here are bfloat16,
 * so the FPCR.FZ16 flush semantics are not wanted and the normal
 * float_status is used for input handling.
 */
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    /* Widen each BFloat16 (low element shifted up, high element masked). */
    const float64 lo1 = float32_to_float64(e1 << 16, fpst);
    const float64 hi1 = float32_to_float64(e1 & 0xffff0000u, fpst);
    const float64 lo2 = float32_to_float64(e2 << 16, fpst);
    const float64 hi2 = float32_to_float64(e2 & 0xffff0000u, fpst);
    float64 dot64;
    float32 dot32;

    /*
     * The ARM pseudocode function FPDot does both multiplies and the
     * add with one rounding.  That is emulated by doing the first
     * multiply in round-to-odd, then folding the second multiply in
     * as a fused multiply-add which also rounds to float32 precision.
     */
    dot64 = float64_mul(lo1, lo2, fpst_odd);
    dot64 = float64r32_muladd(hi1, hi2, dot64, 0, fpst);

    /* Already rounded above, so this narrowing is exact. */
    dot32 = float64_to_float32(dot64, fpst);

    /* The accumulation into 'sum' is a separate, unfused add. */
    return float32_add(sum, dot32, fpst);
}
2886a3ef070eSClaudio Fontana 
HELPER(gvec_bfdot)2887a3ef070eSClaudio Fontana void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2888a3ef070eSClaudio Fontana                         CPUARMState *env, uint32_t desc)
2889a3ef070eSClaudio Fontana {
2890a3ef070eSClaudio Fontana     intptr_t i, opr_sz = simd_oprsz(desc);
2891a3ef070eSClaudio Fontana     float32 *d = vd, *a = va;
2892a3ef070eSClaudio Fontana     uint32_t *n = vn, *m = vm;
2893a3ef070eSClaudio Fontana     float_status fpst, fpst_odd;
2894a3ef070eSClaudio Fontana 
2895a3ef070eSClaudio Fontana     if (is_ebf(env, &fpst, &fpst_odd)) {
2896a3ef070eSClaudio Fontana         for (i = 0; i < opr_sz / 4; ++i) {
2897a3ef070eSClaudio Fontana             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2898a3ef070eSClaudio Fontana         }
2899a3ef070eSClaudio Fontana     } else {
2900a3ef070eSClaudio Fontana         for (i = 0; i < opr_sz / 4; ++i) {
2901a3ef070eSClaudio Fontana             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2902a3ef070eSClaudio Fontana         }
2903a3ef070eSClaudio Fontana     }
2904a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2905a3ef070eSClaudio Fontana }
2906a3ef070eSClaudio Fontana 
HELPER(gvec_bfdot_idx)2907a3ef070eSClaudio Fontana void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2908a3ef070eSClaudio Fontana                             void *va, CPUARMState *env, uint32_t desc)
2909a3ef070eSClaudio Fontana {
2910a3ef070eSClaudio Fontana     intptr_t i, j, opr_sz = simd_oprsz(desc);
2911a3ef070eSClaudio Fontana     intptr_t index = simd_data(desc);
2912a3ef070eSClaudio Fontana     intptr_t elements = opr_sz / 4;
2913a3ef070eSClaudio Fontana     intptr_t eltspersegment = MIN(16 / 4, elements);
2914a3ef070eSClaudio Fontana     float32 *d = vd, *a = va;
2915a3ef070eSClaudio Fontana     uint32_t *n = vn, *m = vm;
2916a3ef070eSClaudio Fontana     float_status fpst, fpst_odd;
2917a3ef070eSClaudio Fontana 
2918a3ef070eSClaudio Fontana     if (is_ebf(env, &fpst, &fpst_odd)) {
2919a3ef070eSClaudio Fontana         for (i = 0; i < elements; i += eltspersegment) {
2920a3ef070eSClaudio Fontana             uint32_t m_idx = m[i + H4(index)];
2921a3ef070eSClaudio Fontana 
2922a3ef070eSClaudio Fontana             for (j = i; j < i + eltspersegment; j++) {
2923a3ef070eSClaudio Fontana                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2924a3ef070eSClaudio Fontana             }
2925a3ef070eSClaudio Fontana         }
2926a3ef070eSClaudio Fontana     } else {
2927a3ef070eSClaudio Fontana         for (i = 0; i < elements; i += eltspersegment) {
2928a3ef070eSClaudio Fontana             uint32_t m_idx = m[i + H4(index)];
2929a3ef070eSClaudio Fontana 
2930a3ef070eSClaudio Fontana             for (j = i; j < i + eltspersegment; j++) {
2931a3ef070eSClaudio Fontana                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2932a3ef070eSClaudio Fontana             }
2933a3ef070eSClaudio Fontana         }
2934a3ef070eSClaudio Fontana     }
2935a3ef070eSClaudio Fontana     clear_tail(d, opr_sz, simd_maxsz(desc));
2936a3ef070eSClaudio Fontana }
2937a3ef070eSClaudio Fontana 
HELPER(gvec_bfmmla)2938a3ef070eSClaudio Fontana void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2939a3ef070eSClaudio Fontana                          CPUARMState *env, uint32_t desc)
2940a3ef070eSClaudio Fontana {
2941a3ef070eSClaudio Fontana     intptr_t s, opr_sz = simd_oprsz(desc);
2942a3ef070eSClaudio Fontana     float32 *d = vd, *a = va;
2943a3ef070eSClaudio Fontana     uint32_t *n = vn, *m = vm;
2944a3ef070eSClaudio Fontana     float_status fpst, fpst_odd;
2945a3ef070eSClaudio Fontana 
2946a3ef070eSClaudio Fontana     if (is_ebf(env, &fpst, &fpst_odd)) {
2947a3ef070eSClaudio Fontana         for (s = 0; s < opr_sz / 4; s += 4) {
2948a3ef070eSClaudio Fontana             float32 sum00, sum01, sum10, sum11;
2949a3ef070eSClaudio Fontana 
2950a3ef070eSClaudio Fontana             /*
2951a3ef070eSClaudio Fontana              * Process the entire segment at once, writing back the
2952a3ef070eSClaudio Fontana              * results only after we've consumed all of the inputs.
2953a3ef070eSClaudio Fontana              *
2954a3ef070eSClaudio Fontana              * Key to indices by column:
2955a3ef070eSClaudio Fontana              *               i   j               i   k             j   k
2956a3ef070eSClaudio Fontana              */
2957             sum00 = a[s + H4(0 + 0)];
2958             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2959             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2960 
2961             sum01 = a[s + H4(0 + 1)];
2962             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2963             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2964 
2965             sum10 = a[s + H4(2 + 0)];
2966             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2967             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2968 
2969             sum11 = a[s + H4(2 + 1)];
2970             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2971             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2972 
2973             d[s + H4(0 + 0)] = sum00;
2974             d[s + H4(0 + 1)] = sum01;
2975             d[s + H4(2 + 0)] = sum10;
2976             d[s + H4(2 + 1)] = sum11;
2977         }
2978     } else {
2979         for (s = 0; s < opr_sz / 4; s += 4) {
2980             float32 sum00, sum01, sum10, sum11;
2981 
2982             /*
2983              * Process the entire segment at once, writing back the
2984              * results only after we've consumed all of the inputs.
2985              *
2986              * Key to indices by column:
2987              *               i   j           i   k             j   k
2988              */
2989             sum00 = a[s + H4(0 + 0)];
2990             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2991             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2992 
2993             sum01 = a[s + H4(0 + 1)];
2994             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2995             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2996 
2997             sum10 = a[s + H4(2 + 0)];
2998             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2999             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3000 
3001             sum11 = a[s + H4(2 + 1)];
3002             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3003             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3004 
3005             d[s + H4(0 + 0)] = sum00;
3006             d[s + H4(0 + 1)] = sum01;
3007             d[s + H4(2 + 0)] = sum10;
3008             d[s + H4(2 + 1)] = sum11;
3009         }
3010     }
3011     clear_tail(d, opr_sz, simd_maxsz(desc));
3012 }
3013 
HELPER(gvec_bfmlal)3014 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3015                          void *stat, uint32_t desc)
3016 {
3017     intptr_t i, opr_sz = simd_oprsz(desc);
3018     intptr_t sel = simd_data(desc);
3019     float32 *d = vd, *a = va;
3020     bfloat16 *n = vn, *m = vm;
3021 
3022     for (i = 0; i < opr_sz / 4; ++i) {
3023         float32 nn = n[H2(i * 2 + sel)] << 16;
3024         float32 mm = m[H2(i * 2 + sel)] << 16;
3025         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3026     }
3027     clear_tail(d, opr_sz, simd_maxsz(desc));
3028 }
3029 
HELPER(gvec_bfmlal_idx)3030 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3031                              void *va, void *stat, uint32_t desc)
3032 {
3033     intptr_t i, j, opr_sz = simd_oprsz(desc);
3034     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3035     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3036     intptr_t elements = opr_sz / 4;
3037     intptr_t eltspersegment = MIN(16 / 4, elements);
3038     float32 *d = vd, *a = va;
3039     bfloat16 *n = vn, *m = vm;
3040 
3041     for (i = 0; i < elements; i += eltspersegment) {
3042         float32 m_idx = m[H2(2 * i + index)] << 16;
3043 
3044         for (j = i; j < i + eltspersegment; j++) {
3045             float32 n_j = n[H2(2 * j + sel)] << 16;
3046             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3047         }
3048     }
3049     clear_tail(d, opr_sz, simd_maxsz(desc));
3050 }
3051 
3052 #define DO_CLAMP(NAME, TYPE) \
3053 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3054 {                                                                       \
3055     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3056     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3057         TYPE aa = *(TYPE *)(a + i);                                     \
3058         TYPE nn = *(TYPE *)(n + i);                                     \
3059         TYPE mm = *(TYPE *)(m + i);                                     \
3060         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3061         *(TYPE *)(d + i) = dd;                                          \
3062     }                                                                   \
3063     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3064 }
3065 
3066 DO_CLAMP(gvec_sclamp_b, int8_t)
3067 DO_CLAMP(gvec_sclamp_h, int16_t)
3068 DO_CLAMP(gvec_sclamp_s, int32_t)
3069 DO_CLAMP(gvec_sclamp_d, int64_t)
3070 
3071 DO_CLAMP(gvec_uclamp_b, uint8_t)
3072 DO_CLAMP(gvec_uclamp_h, uint16_t)
3073 DO_CLAMP(gvec_uclamp_s, uint32_t)
3074 DO_CLAMP(gvec_uclamp_d, uint64_t)
3075