1 /*
2  * QEMU TCG support -- s390x vector string instruction support
3  *
4  * Copyright (C) 2019 Red Hat Inc
5  *
6  * Authors:
7  *   David Hildenbrand <david@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  */
12 #include "qemu/osdep.h"
13 #include "cpu.h"
14 #include "s390x-internal.h"
15 #include "vec.h"
16 #include "tcg/tcg.h"
17 #include "tcg/tcg-gvec-desc.h"
18 #include "exec/helper-proto.h"
19 
/*
 * Flags every element of @a that is zero.
 *
 * @mask is expected to have every bit of each element set except the
 * element's MSB. The result has the MSB of an element set if that element
 * of @a is zero; all other bits of the result are clear.
 */
static inline uint64_t zero_search(uint64_t a, uint64_t mask)
{
    /* Adding the mask carries into an element's MSB if any low bit is set. */
    const uint64_t carried = (a & mask) + mask;
    /* MSB set <=> element is non-zero; the low bits are forced to one. */
    const uint64_t nonzero = carried | a | mask;

    return ~nonzero;
}
28 
/*
 * Flags every element of @a that is not zero.
 *
 * @mask is expected to have every bit of each element set except the
 * element's MSB. The result has the MSB of an element set if that element
 * of @a is non-zero; all other bits of the result are clear.
 */
static inline uint64_t nonzero_search(uint64_t a, uint64_t mask)
{
    /* Adding the mask carries into an element's MSB if any low bit is set. */
    const uint64_t carried = (a & mask) + mask;

    /* Merge in the original MSBs and drop everything but the MSBs. */
    return (carried | a) & ~mask;
}
37 
/*
 * Converts two doublewords of per-element MSB flags into a byte offset.
 *
 * @c0 holds the flags for bytes 0-7, @c1 those for bytes 8-15. Returns the
 * byte offset of the first flagged element, or 16 if no flag is set
 * (this relies on clz64(0) yielding 64).
 */
static inline int match_index(uint64_t c0, uint64_t c1)
{
    int bit_index;

    if (c0) {
        bit_index = clz64(c0);
    } else {
        /* continue counting behind the 64 bits of the first doubleword */
        bit_index = clz64(c1) + 64;
    }
    /* bit index -> byte index */
    return bit_index >> 3;
}
45 
46 /*
47  * Returns the number of bits composing one element.
48  */
get_element_bits(uint8_t es)49 static uint8_t get_element_bits(uint8_t es)
50 {
51     return (1 << es) * BITS_PER_BYTE;
52 }
53 
/*
 * Returns a right-aligned bitmask covering all bits of a single element of
 * size @es.
 */
static uint64_t get_single_element_mask(uint8_t es)
{
    /* number of bits of the doubleword that lie outside the element */
    const int unused_bits = 64 - get_element_bits(es);

    return ~0ull >> unused_bits;
}
61 
/*
 * Returns a right-aligned bitmask covering all bits of a single element of
 * size @es, except for the element's MSB.
 */
static uint64_t get_single_element_lsbs_mask(uint8_t es)
{
    /* one more than the bits outside the element, to drop the MSB as well */
    const int dropped_bits = 65 - get_element_bits(es);

    return ~0ull >> dropped_bits;
}
69 
/*
 * Returns the "all bits but the MSB" mask of a single element of size @es,
 * duplicated via dup_const() so that it covers multiple elements.
 */
static uint64_t get_element_lsbs_mask(uint8_t es)
{
    const uint64_t single = get_single_element_lsbs_mask(es);

    return dup_const(es, single);
}
77 
/*
 * Finds the elements of @v2 that are equal to any element of @v3 and stores
 * the result in @v1.
 *
 * @in: invert the per-element match result
 * @rt: store a per-element mask (all ones for a match, all zeroes otherwise)
 *      instead of a byte index
 * @zs: additionally search @v2 for a zero element
 * @es: log2 of the element size in bytes
 *
 * Without @rt, doubleword 0 of @v1 receives the smaller of the byte index of
 * the first match and the byte index of the first zero element (16 if there
 * is neither), and doubleword 1 is cleared.
 *
 * Returns:
 *   3 - no match
 *   1 - matching elements, no match for zero
 *   2 - matching elements before match for zero
 *   0 - match for zero
 */
static int vfae(void *v1, const void *v2, const void *v3, bool in,
                bool rt, bool zs, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    const int ebits = get_element_bits(es);
    const uint64_t str0 = s390_vec_read_element64(v2, 0);
    const uint64_t str1 = s390_vec_read_element64(v2, 1);
    const uint64_t set0 = s390_vec_read_element64(v3, 0);
    const uint64_t set1 = s390_vec_read_element64(v3, 1);
    uint64_t hit0 = 0, hit1 = 0;
    uint64_t first_zero = 16;
    uint64_t first_equal;
    int rot;

    /*
     * Rotate the doublewords of v3 element by element, so every element of
     * v2 gets compared against every element of v3. A match sets the MSB of
     * the respective v2 element in hit0/hit1.
     */
    for (rot = 0; rot < 64; rot += ebits) {
        const uint64_t rset0 = rol64(set0, rot);
        const uint64_t rset1 = rol64(set1, rot);

        hit0 |= zero_search(str0 ^ rset0, lsbs) |
                zero_search(str0 ^ rset1, lsbs);
        hit1 |= zero_search(str1 ^ rset0, lsbs) |
                zero_search(str1 ^ rset1, lsbs);
    }

    /* invert the result if requested - only the MSBs carry information */
    if (in) {
        hit0 = ~(hit0 | lsbs);
        hit1 = ~(hit1 | lsbs);
    }
    first_equal = match_index(hit0, hit1);

    if (zs) {
        first_zero = match_index(zero_search(str0, lsbs),
                                 zero_search(str1, lsbs));
    }

    if (!rt) {
        s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
        s390_vec_write_element64(v1, 1, 0);
    } else {
        const uint64_t ones = get_single_element_mask(es);

        /* move each flag to its element's LSB and widen it to the element */
        s390_vec_write_element64(v1, 0, (hit0 >> (ebits - 1)) * ones);
        s390_vec_write_element64(v1, 1, (hit1 >> (ebits - 1)) * ones);
    }

    if (first_zero == 16) {
        /* no zero element found (or not searched for) */
        return first_equal == 16 ? 3 : 1;
    }
    /* a zero element was found: did a match precede it? */
    return first_equal < first_zero ? 2 : 0;
}
135 
136 #define DEF_VFAE_HELPER(BITS)                                                  \
137 void HELPER(gvec_vfae##BITS)(void *v1, const void *v2, const void *v3,         \
138                              uint32_t desc)                                    \
139 {                                                                              \
140     const bool in = extract32(simd_data(desc), 3, 1);                          \
141     const bool rt = extract32(simd_data(desc), 2, 1);                          \
142     const bool zs = extract32(simd_data(desc), 1, 1);                          \
143                                                                                \
144     vfae(v1, v2, v3, in, rt, zs, MO_##BITS);                                   \
145 }
146 DEF_VFAE_HELPER(8)
147 DEF_VFAE_HELPER(16)
148 DEF_VFAE_HELPER(32)
149 
150 #define DEF_VFAE_CC_HELPER(BITS)                                               \
151 void HELPER(gvec_vfae_cc##BITS)(void *v1, const void *v2, const void *v3,      \
152                                 CPUS390XState *env, uint32_t desc)             \
153 {                                                                              \
154     const bool in = extract32(simd_data(desc), 3, 1);                          \
155     const bool rt = extract32(simd_data(desc), 2, 1);                          \
156     const bool zs = extract32(simd_data(desc), 1, 1);                          \
157                                                                                \
158     env->cc_op = vfae(v1, v2, v3, in, rt, zs, MO_##BITS);                      \
159 }
160 DEF_VFAE_CC_HELPER(8)
161 DEF_VFAE_CC_HELPER(16)
162 DEF_VFAE_CC_HELPER(32)
163 
/*
 * Finds the first element of @v2 that equals the element at the same
 * position in @v3 and stores its byte index in @v1.
 *
 * @zs: additionally search @v2 for a zero element
 * @es: log2 of the element size in bytes
 *
 * Doubleword 0 of @v1 receives the smaller of the byte index of the first
 * equal element and the byte index of the first zero element (16 if there
 * is neither); doubleword 1 is cleared.
 *
 * Returns:
 *   3 - no match
 *   1 - matching elements, no match for zero
 *   2 - matching elements before match for zero
 *   0 - match for zero
 */
static int vfee(void *v1, const void *v2, const void *v3, bool zs, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    const uint64_t lhs0 = s390_vec_read_element64(v2, 0);
    const uint64_t lhs1 = s390_vec_read_element64(v2, 1);
    const uint64_t rhs0 = s390_vec_read_element64(v3, 0);
    const uint64_t rhs1 = s390_vec_read_element64(v3, 1);
    /* equal elements XOR to zero */
    const uint64_t first_equal = match_index(zero_search(lhs0 ^ rhs0, lsbs),
                                             zero_search(lhs1 ^ rhs1, lsbs));
    uint64_t first_zero = 16;

    if (zs) {
        first_zero = match_index(zero_search(lhs0, lsbs),
                                 zero_search(lhs1, lsbs));
    }

    s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
    s390_vec_write_element64(v1, 1, 0);

    if (first_zero == 16) {
        /* no zero element found (or not searched for) */
        return first_equal == 16 ? 3 : 1;
    }
    /* a zero element was found: did an equal element precede it? */
    return first_equal < first_zero ? 2 : 0;
}
196 
197 #define DEF_VFEE_HELPER(BITS)                                                  \
198 void HELPER(gvec_vfee##BITS)(void *v1, const void *v2, const void *v3,         \
199                              uint32_t desc)                                    \
200 {                                                                              \
201     const bool zs = extract32(simd_data(desc), 1, 1);                          \
202                                                                                \
203     vfee(v1, v2, v3, zs, MO_##BITS);                                           \
204 }
205 DEF_VFEE_HELPER(8)
206 DEF_VFEE_HELPER(16)
207 DEF_VFEE_HELPER(32)
208 
209 #define DEF_VFEE_CC_HELPER(BITS)                                               \
210 void HELPER(gvec_vfee_cc##BITS)(void *v1, const void *v2, const void *v3,      \
211                                 CPUS390XState *env, uint32_t desc)             \
212 {                                                                              \
213     const bool zs = extract32(simd_data(desc), 1, 1);                          \
214                                                                                \
215     env->cc_op = vfee(v1, v2, v3, zs, MO_##BITS);                              \
216 }
217 DEF_VFEE_CC_HELPER(8)
218 DEF_VFEE_CC_HELPER(16)
219 DEF_VFEE_CC_HELPER(32)
220 
/*
 * Finds the first element of @v2 that differs from the element at the same
 * position in @v3 and stores its byte index in @v1.
 *
 * @zs: additionally search @v2 for a zero element
 * @es: log2 of the element size in bytes
 *
 * Doubleword 0 of @v1 receives the smaller of the byte index of the first
 * unequal element and the byte index of the first zero element (16 if there
 * is neither); doubleword 1 is cleared.
 *
 * Returns:
 *   3 - neither an unequal nor a zero element was found
 *   0 - a zero element precedes the first unequal element
 *   1 - the first unequal element of @v2 is smaller than the one of @v3
 *   2 - the first unequal element of @v2 is not smaller than the one of @v3
 */
static int vfene(void *v1, const void *v2, const void *v3, bool zs, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    const uint64_t lhs0 = s390_vec_read_element64(v2, 0);
    const uint64_t lhs1 = s390_vec_read_element64(v2, 1);
    const uint64_t rhs0 = s390_vec_read_element64(v3, 0);
    const uint64_t rhs1 = s390_vec_read_element64(v3, 1);
    /* unequal elements XOR to something non-zero */
    const uint64_t first_inequal =
        match_index(nonzero_search(lhs0 ^ rhs0, lsbs),
                    nonzero_search(lhs1 ^ rhs1, lsbs));
    uint64_t first_zero = 16;
    bool smaller = false;

    /* identify the smaller element at the first unequal position */
    if (first_inequal < 16) {
        /* byte index -> element number */
        const uint8_t elem = first_inequal >> es;
        const uint32_t lhs = s390_vec_read_element(v2, elem, es);
        const uint32_t rhs = s390_vec_read_element(v3, elem, es);

        smaller = lhs < rhs;
    }

    if (zs) {
        first_zero = match_index(zero_search(lhs0, lsbs),
                                 zero_search(lhs1, lsbs));
    }

    s390_vec_write_element64(v1, 0, MIN(first_inequal, first_zero));
    s390_vec_write_element64(v1, 1, 0);

    if (first_zero == 16 && first_inequal == 16) {
        return 3;
    }
    if (first_zero < first_inequal) {
        return 0;
    }
    if (smaller) {
        return 1;
    }
    return 2;
}
261 
262 #define DEF_VFENE_HELPER(BITS)                                                 \
263 void HELPER(gvec_vfene##BITS)(void *v1, const void *v2, const void *v3,        \
264                               uint32_t desc)                                   \
265 {                                                                              \
266     const bool zs = extract32(simd_data(desc), 1, 1);                          \
267                                                                                \
268     vfene(v1, v2, v3, zs, MO_##BITS);                                          \
269 }
270 DEF_VFENE_HELPER(8)
271 DEF_VFENE_HELPER(16)
272 DEF_VFENE_HELPER(32)
273 
274 #define DEF_VFENE_CC_HELPER(BITS)                                              \
275 void HELPER(gvec_vfene_cc##BITS)(void *v1, const void *v2, const void *v3,     \
276                                  CPUS390XState *env, uint32_t desc)            \
277 {                                                                              \
278     const bool zs = extract32(simd_data(desc), 1, 1);                          \
279                                                                                \
280     env->cc_op = vfene(v1, v2, v3, zs, MO_##BITS);                             \
281 }
282 DEF_VFENE_CC_HELPER(8)
283 DEF_VFENE_CC_HELPER(16)
284 DEF_VFENE_CC_HELPER(32)
285 
/*
 * Copies @v2 to @v1, clearing everything from the first zero element on.
 *
 * If doubleword 0 of @v2 contains a zero element, all bits of doubleword 0
 * at and below that element's MSB are cleared and doubleword 1 is cleared
 * completely. Otherwise, the same truncation is applied to doubleword 1 if
 * it contains a zero element.
 *
 * @es: log2 of the element size in bytes
 *
 * Returns 0 if a zero element was found, 3 otherwise.
 */
static int vistr(void *v1, const void *v2, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    uint64_t dw0 = s390_vec_read_element64(v2, 0);
    uint64_t dw1 = s390_vec_read_element64(v2, 1);
    const uint64_t zeroes0 = zero_search(dw0, lsbs);
    int cc = 3;

    if (zeroes0) {
        /* keep only the bits in front of the first zero element */
        dw0 &= ~(~0ull >> clz64(zeroes0));
        dw1 = 0;
        cc = 0;
    } else {
        const uint64_t zeroes1 = zero_search(dw1, lsbs);

        if (zeroes1) {
            dw1 &= ~(~0ull >> clz64(zeroes1));
            cc = 0;
        }
    }

    s390_vec_write_element64(v1, 0, dw0);
    s390_vec_write_element64(v1, 1, dw1);
    return cc;
}
311 
312 #define DEF_VISTR_HELPER(BITS)                                                 \
313 void HELPER(gvec_vistr##BITS)(void *v1, const void *v2, uint32_t desc)         \
314 {                                                                              \
315     vistr(v1, v2, MO_##BITS);                                                  \
316 }
317 DEF_VISTR_HELPER(8)
318 DEF_VISTR_HELPER(16)
319 DEF_VISTR_HELPER(32)
320 
321 #define DEF_VISTR_CC_HELPER(BITS)                                              \
322 void HELPER(gvec_vistr_cc##BITS)(void *v1, const void *v2, CPUS390XState *env, \
323                                 uint32_t desc)                                 \
324 {                                                                              \
325     env->cc_op = vistr(v1, v2, MO_##BITS);                                     \
326 }
327 DEF_VISTR_CC_HELPER(8)
328 DEF_VISTR_CC_HELPER(16)
329 DEF_VISTR_CC_HELPER(32)
330 
/*
 * Evaluates the comparison of @data against @l as selected by the control
 * byte @c.
 *
 * Bit 7 of @c is the result for data == l, bit 6 the result for data < l
 * and bit 5 the result for data > l.
 */
static bool element_compare(uint32_t data, uint32_t l, uint8_t c)
{
    if (data == l) {
        return extract32(c, 7, 1);
    }
    /* lower -> bit 6, higher -> bit 5 */
    return extract32(c, data < l ? 6 : 5, 1);
}
344 
vstrc(void * v1,const void * v2,const void * v3,const void * v4,bool in,bool rt,bool zs,uint8_t es)345 static int vstrc(void *v1, const void *v2, const void *v3, const void *v4,
346                  bool in, bool rt, bool zs, uint8_t es)
347 {
348     const uint64_t mask = get_element_lsbs_mask(es);
349     uint64_t a0 = s390_vec_read_element64(v2, 0);
350     uint64_t a1 = s390_vec_read_element64(v2, 1);
351     int first_zero = 16, first_match = 16;
352     S390Vector rt_result = {};
353     uint64_t z0, z1;
354     int i, j;
355 
356     if (zs) {
357         z0 = zero_search(a0, mask);
358         z1 = zero_search(a1, mask);
359         first_zero = match_index(z0, z1);
360     }
361 
362     for (i = 0; i < 16 / (1 << es); i++) {
363         const uint32_t data = s390_vec_read_element(v2, i, es);
364         const int cur_byte = i * (1 << es);
365         bool any_match = false;
366 
367         /* if we don't need a bit vector, we can stop early */
368         if (cur_byte == first_zero && !rt) {
369             break;
370         }
371 
372         for (j = 0; j < 16 / (1 << es); j += 2) {
373             const uint32_t l1 = s390_vec_read_element(v3, j, es);
374             const uint32_t l2 = s390_vec_read_element(v3, j + 1, es);
375             /* we are only interested in the highest byte of each element */
376             const uint8_t c1 = s390_vec_read_element8(v4, j * (1 << es));
377             const uint8_t c2 = s390_vec_read_element8(v4, (j + 1) * (1 << es));
378 
379             if (element_compare(data, l1, c1) &&
380                 element_compare(data, l2, c2)) {
381                 any_match = true;
382                 break;
383             }
384         }
385         /* invert the result if requested */
386         any_match = in ^ any_match;
387 
388         if (any_match) {
389             /* indicate bit vector if requested */
390             if (rt) {
391                 const uint64_t val = -1ull;
392 
393                 first_match = MIN(cur_byte, first_match);
394                 s390_vec_write_element(&rt_result, i, es, val);
395             } else {
396                 /* stop on the first match */
397                 first_match = cur_byte;
398                 break;
399             }
400         }
401     }
402 
403     if (rt) {
404         *(S390Vector *)v1 = rt_result;
405     } else {
406         s390_vec_write_element64(v1, 0, MIN(first_match, first_zero));
407         s390_vec_write_element64(v1, 1, 0);
408     }
409 
410     if (first_zero == 16 && first_match == 16) {
411         return 3; /* no match */
412     } else if (first_zero == 16) {
413         return 1; /* matching elements, no match for zero */
414     } else if (first_match < first_zero) {
415         return 2; /* matching elements before match for zero */
416     }
417     return 0; /* match for zero */
418 }
419 
420 #define DEF_VSTRC_HELPER(BITS)                                                 \
421 void HELPER(gvec_vstrc##BITS)(void *v1, const void *v2, const void *v3,        \
422                               const void *v4, uint32_t desc)                   \
423 {                                                                              \
424     const bool in = extract32(simd_data(desc), 3, 1);                          \
425     const bool zs = extract32(simd_data(desc), 1, 1);                          \
426                                                                                \
427     vstrc(v1, v2, v3, v4, in, 0, zs, MO_##BITS);                               \
428 }
429 DEF_VSTRC_HELPER(8)
430 DEF_VSTRC_HELPER(16)
431 DEF_VSTRC_HELPER(32)
432 
433 #define DEF_VSTRC_RT_HELPER(BITS)                                              \
434 void HELPER(gvec_vstrc_rt##BITS)(void *v1, const void *v2, const void *v3,     \
435                                  const void *v4, uint32_t desc)                \
436 {                                                                              \
437     const bool in = extract32(simd_data(desc), 3, 1);                          \
438     const bool zs = extract32(simd_data(desc), 1, 1);                          \
439                                                                                \
440     vstrc(v1, v2, v3, v4, in, 1, zs, MO_##BITS);                               \
441 }
442 DEF_VSTRC_RT_HELPER(8)
443 DEF_VSTRC_RT_HELPER(16)
444 DEF_VSTRC_RT_HELPER(32)
445 
446 #define DEF_VSTRC_CC_HELPER(BITS)                                              \
447 void HELPER(gvec_vstrc_cc##BITS)(void *v1, const void *v2, const void *v3,     \
448                                  const void *v4, CPUS390XState *env,           \
449                                  uint32_t desc)                                \
450 {                                                                              \
451     const bool in = extract32(simd_data(desc), 3, 1);                          \
452     const bool zs = extract32(simd_data(desc), 1, 1);                          \
453                                                                                \
454     env->cc_op = vstrc(v1, v2, v3, v4, in, 0, zs, MO_##BITS);                  \
455 }
456 DEF_VSTRC_CC_HELPER(8)
457 DEF_VSTRC_CC_HELPER(16)
458 DEF_VSTRC_CC_HELPER(32)
459 
460 #define DEF_VSTRC_CC_RT_HELPER(BITS)                                           \
461 void HELPER(gvec_vstrc_cc_rt##BITS)(void *v1, const void *v2, const void *v3,  \
462                                     const void *v4, CPUS390XState *env,        \
463                                     uint32_t desc)                             \
464 {                                                                              \
465     const bool in = extract32(simd_data(desc), 3, 1);                          \
466     const bool zs = extract32(simd_data(desc), 1, 1);                          \
467                                                                                \
468     env->cc_op = vstrc(v1, v2, v3, v4, in, 1, zs, MO_##BITS);                  \
469 }
470 DEF_VSTRC_CC_RT_HELPER(8)
471 DEF_VSTRC_CC_RT_HELPER(16)
472 DEF_VSTRC_CC_RT_HELPER(32)
473 
/*
 * Searches the string in @v2 for the substring in @v3 and stores the byte
 * index of the result in doubleword 0 of @v1 (doubleword 1 is cleared).
 *
 * The substring length in bytes is taken from byte 7 of @v4.
 *
 * @es: log2 of the element size in bytes
 * @zs: treat a zero element as end of string in both @v2 and @v3
 *
 * Returns:
 *   0 - no match, no zero element found in @v2
 *   1 - no match (or only a match starting behind the first zero element),
 *       and @v2 contains a zero element
 *   2 - full match (always reported at index 0 for an empty substring)
 *   3 - partial match: the leading part of the substring matches at the end
 *       of the vector
 *
 * The stored index is 16 whenever no match is reported.
 */
static int vstrs(S390Vector *v1, const S390Vector *v2, const S390Vector *v3,
                 const S390Vector *v4, uint8_t es, bool zs)
{
    const int nelem = 16 >> es;
    int sub_len = s390_vec_read_element8(v4, 7) >> es;
    int pos, cc;

    /* If ZS, bound substr length by min(nelem, strlen(v3)). */
    if (zs) {
        int n = 0;

        sub_len = MIN(sub_len, nelem);
        while (n < sub_len && s390_vec_read_element(v3, n, es) != 0) {
            n++;
        }
        sub_len = n;
    }

    if (sub_len == 0) {
        /* full match for degenerate case of empty substr */
        cc = 2;
        pos = 0;
    } else {
        int str_eos = nelem;

        /* If ZS, look for eos in the searched string. */
        if (zs) {
            str_eos = 0;
            while (str_eos < nelem &&
                   s390_vec_read_element(v2, str_eos, es) != 0) {
                str_eos++;
            }
        }

        /* Unless a match is found below: no match. */
        cc = str_eos == nelem ? 0 : 1;

        for (pos = 0; pos < nelem; pos++) {
            /* the comparison is cut off at the end of the vector */
            const int end = MIN(nelem, pos + sub_len);
            int cur;

            for (cur = pos; cur < end; cur++) {
                const uint32_t str_el = s390_vec_read_element(v2, cur, es);
                const uint32_t sub_el = s390_vec_read_element(v3, cur - pos,
                                                              es);

                if (str_el != sub_el) {
                    break;
                }
            }
            if (cur != end) {
                /* mismatch, try the next start position */
                continue;
            }

            /* All elements matched. */
            if (pos > str_eos) {
                cc = 1;  /* Ignored match. */
                pos = nelem;
            } else if (end - pos == sub_len) {
                cc = 2;  /* Full match. */
            } else {
                cc = 3;  /* Partial match. */
            }
            break;
        }
    }

    s390_vec_write_element64(v1, 0, pos << es);
    s390_vec_write_element64(v1, 1, 0);
    return cc;
}
540 
541 #define DEF_VSTRS_HELPER(BITS)                                             \
542 void QEMU_FLATTEN HELPER(gvec_vstrs_##BITS)(void *v1, const void *v2,      \
543     const void *v3, const void *v4, CPUS390XState *env, uint32_t desc)     \
544     { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, false); }              \
545 void QEMU_FLATTEN HELPER(gvec_vstrs_zs##BITS)(void *v1, const void *v2,    \
546     const void *v3, const void *v4, CPUS390XState *env, uint32_t desc)     \
547     { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, true); }
548 
549 DEF_VSTRS_HELPER(8)
550 DEF_VSTRS_HELPER(16)
551 DEF_VSTRS_HELPER(32)
552