1 /*
2 * QEMU TCG support -- s390x vector string instruction support
3 *
4 * Copyright (C) 2019 Red Hat Inc
5 *
6 * Authors:
7 * David Hildenbrand <david@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
11 */
12 #include "qemu/osdep.h"
13 #include "cpu.h"
14 #include "s390x-internal.h"
15 #include "vec.h"
16 #include "tcg/tcg.h"
17 #include "tcg/tcg-gvec-desc.h"
18 #include "exec/helper-proto.h"
19
/*
 * Locate zero elements inside a 64-bit word.
 *
 * @a:    the packed elements to inspect
 * @mask: per-element mask; it is expected to select every bit of an element
 *        except its MSB (see get_element_lsbs_mask())
 *
 * Returns a word in which the MSB of each element is set iff that element
 * of @a is zero; all other bits are clear.
 */
static inline uint64_t zero_search(uint64_t a, uint64_t mask)
{
    /* Adding the mask carries into an element's MSB iff a low bit is set. */
    const uint64_t low_bits_carry = (a & mask) + mask;
    /* Fold in the element's own MSB and blank out the non-MSB positions. */
    const uint64_t nonzero_or_lsbs = low_bits_carry | a | mask;

    return ~nonzero_or_lsbs;
}
28
/*
 * Locate non-zero elements inside a 64-bit word.
 *
 * @a:    the packed elements to inspect
 * @mask: per-element mask; it is expected to select every bit of an element
 *        except its MSB (see get_element_lsbs_mask())
 *
 * Returns a word in which the MSB of each element is set iff that element
 * of @a is not zero; all other bits are clear.
 */
static inline uint64_t nonzero_search(uint64_t a, uint64_t mask)
{
    /* Adding the mask carries into an element's MSB iff a low bit is set. */
    const uint64_t low_bits_carry = (a & mask) + mask;
    /* An element is non-zero if a low bit or its own MSB is set. */
    const uint64_t any_bit_set = low_bits_carry | a;

    /* Keep only the MSB positions. */
    return any_bit_set & ~mask;
}
37
/*
 * Convert a pair of match words (MSB-per-element, as produced by
 * zero_search()/nonzero_search()) into a byte offset.
 *
 * @c0: match word for bytes 0-7
 * @c1: match word for bytes 8-15
 *
 * Returns the byte offset of the leftmost set bit. When neither word has a
 * bit set the result is 16; this relies on clz64(0) yielding 64.
 */
static inline int match_index(uint64_t c0, uint64_t c1)
{
    int bit_offset;

    if (c0) {
        bit_offset = clz64(c0);
    } else {
        /* Nothing in the first half: continue counting in the second. */
        bit_offset = clz64(c1) + 64;
    }
    /* Bits to bytes. */
    return bit_offset >> 3;
}
45
46 /*
47 * Returns the number of bits composing one element.
48 */
get_element_bits(uint8_t es)49 static uint8_t get_element_bits(uint8_t es)
50 {
51 return (1 << es) * BITS_PER_BYTE;
52 }
53
/*
 * Right-aligned bitmask covering all bits of exactly one element.
 *
 * @es: element size as log2 of the size in bytes
 */
static uint64_t get_single_element_mask(uint8_t es)
{
    /* Number of high bits of the 64-bit word that lie outside the element. */
    const int unused_bits = 64 - get_element_bits(es);

    return ~0ull >> unused_bits;
}
61
/*
 * Right-aligned bitmask covering one element without its MSB.
 *
 * @es: element size as log2 of the size in bytes
 */
static uint64_t get_single_element_lsbs_mask(uint8_t es)
{
    /* One bit more than for the full element mask: drop the MSB as well. */
    const int dropped_bits = 65 - get_element_bits(es);

    return ~0ull >> dropped_bits;
}
69
/*
 * Bitmask selecting, for every element packed into a 64-bit word, all bits
 * except the element's MSB. The single-element pattern is replicated with
 * dup_const().
 *
 * @es: element size as log2 of the size in bytes
 */
static uint64_t get_element_lsbs_mask(uint8_t es)
{
    const uint64_t one_element = get_single_element_lsbs_mask(es);

    return dup_const(es, one_element);
}
77
/*
 * Find any element equal: compare every element of @v2 with every element
 * of @v3.
 *
 * @v1: destination vector
 * @v2: vector whose elements are searched (and checked for zero if @zs)
 * @v3: vector providing the elements to compare against
 * @in: invert the per-element comparison result
 * @rt: if set, write a per-element all-ones/all-zeroes mask to @v1;
 *      otherwise write the byte index MIN(first match, first zero) to
 *      doubleword 0 and zero to doubleword 1
 * @zs: additionally search @v2 for a zero element
 * @es: element size as log2 of the size in bytes
 *
 * Returns:
 *   3 - no match and no zero element
 *   1 - match found, no zero element
 *   2 - match found before the zero element
 *   0 - zero element found at or before any match
 */
static int vfae(void *v1, const void *v2, const void *v3, bool in,
                bool rt, bool zs, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    const int ebits = get_element_bits(es);
    /* Fetch all inputs up front; nothing is written until they are read. */
    const uint64_t a0 = s390_vec_read_element64(v2, 0);
    const uint64_t a1 = s390_vec_read_element64(v2, 1);
    const uint64_t b0 = s390_vec_read_element64(v3, 0);
    const uint64_t b1 = s390_vec_read_element64(v3, 1);
    uint64_t e0 = 0, e1 = 0;
    uint64_t first_zero = 16;
    uint64_t first_equal;
    int rot;

    /*
     * Rotate both halves of v3 element by element, so that every element of
     * v3 gets lined up with (and compared against) every element of v2.
     */
    for (rot = 0; rot < 64; rot += ebits) {
        const uint64_t r0 = rol64(b0, rot);
        const uint64_t r1 = rol64(b1, rot);

        e0 |= zero_search(a0 ^ r0, lsbs) | zero_search(a0 ^ r1, lsbs);
        e1 |= zero_search(a1 ^ r0, lsbs) | zero_search(a1 ^ r1, lsbs);
    }

    /* Inversion flips the element MSBs only; the low bits stay clear. */
    if (in) {
        e0 = ~(e0 | lsbs);
        e1 = ~(e1 | lsbs);
    }
    first_equal = match_index(e0, e1);

    if (zs) {
        first_zero = match_index(zero_search(a0, lsbs),
                                 zero_search(a1, lsbs));
    }

    if (rt) {
        /* Spread each element's MSB flag over the whole element. */
        const uint64_t emask = get_single_element_mask(es);

        s390_vec_write_element64(v1, 0, (e0 >> (ebits - 1)) * emask);
        s390_vec_write_element64(v1, 1, (e1 >> (ebits - 1)) * emask);
    } else {
        s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
        s390_vec_write_element64(v1, 1, 0);
    }

    if (first_zero == 16) {
        /* no zero element: either nothing matched at all, or a match */
        return first_equal == 16 ? 3 : 1;
    }
    /* zero element present: did an element match in front of it? */
    return first_equal < first_zero ? 2 : 0;
}
135
136 #define DEF_VFAE_HELPER(BITS) \
137 void HELPER(gvec_vfae##BITS)(void *v1, const void *v2, const void *v3, \
138 uint32_t desc) \
139 { \
140 const bool in = extract32(simd_data(desc), 3, 1); \
141 const bool rt = extract32(simd_data(desc), 2, 1); \
142 const bool zs = extract32(simd_data(desc), 1, 1); \
143 \
144 vfae(v1, v2, v3, in, rt, zs, MO_##BITS); \
145 }
146 DEF_VFAE_HELPER(8)
147 DEF_VFAE_HELPER(16)
148 DEF_VFAE_HELPER(32)
149
150 #define DEF_VFAE_CC_HELPER(BITS) \
151 void HELPER(gvec_vfae_cc##BITS)(void *v1, const void *v2, const void *v3, \
152 CPUS390XState *env, uint32_t desc) \
153 { \
154 const bool in = extract32(simd_data(desc), 3, 1); \
155 const bool rt = extract32(simd_data(desc), 2, 1); \
156 const bool zs = extract32(simd_data(desc), 1, 1); \
157 \
158 env->cc_op = vfae(v1, v2, v3, in, rt, zs, MO_##BITS); \
159 }
160 DEF_VFAE_CC_HELPER(8)
161 DEF_VFAE_CC_HELPER(16)
162 DEF_VFAE_CC_HELPER(32)
163
/*
 * Find element equal: compare the elements of @v2 and @v3 position by
 * position.
 *
 * @v1: destination vector; doubleword 0 receives the byte index
 *      MIN(first equal element, first zero element), doubleword 1 is zeroed
 * @v2: first source vector (also checked for a zero element if @zs)
 * @v3: second source vector
 * @zs: additionally search @v2 for a zero element
 * @es: element size as log2 of the size in bytes
 *
 * Returns:
 *   3 - no equal element and no zero element
 *   1 - equal element found, no zero element
 *   2 - equal element found before the zero element
 *   0 - zero element found at or before any equal element
 */
static int vfee(void *v1, const void *v2, const void *v3, bool zs, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    /* Fetch all inputs up front; nothing is written until they are read. */
    const uint64_t a0 = s390_vec_read_element64(v2, 0);
    const uint64_t a1 = s390_vec_read_element64(v2, 1);
    const uint64_t b0 = s390_vec_read_element64(v3, 0);
    const uint64_t b1 = s390_vec_read_element64(v3, 1);
    /* Equal elements XOR to zero, so search the XOR for zero elements. */
    const uint64_t first_equal = match_index(zero_search(a0 ^ b0, lsbs),
                                             zero_search(a1 ^ b1, lsbs));
    uint64_t first_zero = 16;

    if (zs) {
        first_zero = match_index(zero_search(a0, lsbs),
                                 zero_search(a1, lsbs));
    }

    s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
    s390_vec_write_element64(v1, 1, 0);

    if (first_zero == 16) {
        /* no zero element: either nothing matched at all, or a match */
        return first_equal == 16 ? 3 : 1;
    }
    /* zero element present: did an element match in front of it? */
    return first_equal < first_zero ? 2 : 0;
}
196
197 #define DEF_VFEE_HELPER(BITS) \
198 void HELPER(gvec_vfee##BITS)(void *v1, const void *v2, const void *v3, \
199 uint32_t desc) \
200 { \
201 const bool zs = extract32(simd_data(desc), 1, 1); \
202 \
203 vfee(v1, v2, v3, zs, MO_##BITS); \
204 }
205 DEF_VFEE_HELPER(8)
206 DEF_VFEE_HELPER(16)
207 DEF_VFEE_HELPER(32)
208
209 #define DEF_VFEE_CC_HELPER(BITS) \
210 void HELPER(gvec_vfee_cc##BITS)(void *v1, const void *v2, const void *v3, \
211 CPUS390XState *env, uint32_t desc) \
212 { \
213 const bool zs = extract32(simd_data(desc), 1, 1); \
214 \
215 env->cc_op = vfee(v1, v2, v3, zs, MO_##BITS); \
216 }
217 DEF_VFEE_CC_HELPER(8)
218 DEF_VFEE_CC_HELPER(16)
219 DEF_VFEE_CC_HELPER(32)
220
/*
 * Find element not equal: compare the elements of @v2 and @v3 position by
 * position and locate the first difference.
 *
 * @v1: destination vector; doubleword 0 receives the byte index
 *      MIN(first unequal element, first zero element), doubleword 1 is
 *      zeroed
 * @v2: first source vector (also checked for a zero element if @zs)
 * @v3: second source vector
 * @zs: additionally search @v2 for a zero element
 * @es: element size as log2 of the size in bytes
 *
 * Returns:
 *   3 - all elements equal and no zero element
 *   0 - zero element found before any unequal element
 *   1 - otherwise, if the first unequal element of @v2 is smaller than the
 *       one of @v3 (unsigned comparison)
 *   2 - otherwise
 */
static int vfene(void *v1, const void *v2, const void *v3, bool zs, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    /* Fetch all inputs up front; nothing is written until they are read. */
    const uint64_t a0 = s390_vec_read_element64(v2, 0);
    const uint64_t a1 = s390_vec_read_element64(v2, 1);
    const uint64_t b0 = s390_vec_read_element64(v3, 0);
    const uint64_t b1 = s390_vec_read_element64(v3, 1);
    /* Differing elements XOR to something non-zero. */
    const uint64_t first_inequal = match_index(nonzero_search(a0 ^ b0, lsbs),
                                               nonzero_search(a1 ^ b1, lsbs));
    uint64_t first_zero = 16;
    bool smaller = false;

    /* Work out which side holds the smaller value at the first mismatch. */
    if (first_inequal < 16) {
        /* byte offset -> element number */
        const uint8_t enr = first_inequal >> es;
        const uint32_t a = s390_vec_read_element(v2, enr, es);
        const uint32_t b = s390_vec_read_element(v3, enr, es);

        smaller = a < b;
    }

    if (zs) {
        first_zero = match_index(zero_search(a0, lsbs),
                                 zero_search(a1, lsbs));
    }

    s390_vec_write_element64(v1, 0, MIN(first_inequal, first_zero));
    s390_vec_write_element64(v1, 1, 0);

    if (first_zero == 16 && first_inequal == 16) {
        return 3;
    }
    if (first_zero < first_inequal) {
        return 0;
    }
    return smaller ? 1 : 2;
}
261
262 #define DEF_VFENE_HELPER(BITS) \
263 void HELPER(gvec_vfene##BITS)(void *v1, const void *v2, const void *v3, \
264 uint32_t desc) \
265 { \
266 const bool zs = extract32(simd_data(desc), 1, 1); \
267 \
268 vfene(v1, v2, v3, zs, MO_##BITS); \
269 }
270 DEF_VFENE_HELPER(8)
271 DEF_VFENE_HELPER(16)
272 DEF_VFENE_HELPER(32)
273
274 #define DEF_VFENE_CC_HELPER(BITS) \
275 void HELPER(gvec_vfene_cc##BITS)(void *v1, const void *v2, const void *v3, \
276 CPUS390XState *env, uint32_t desc) \
277 { \
278 const bool zs = extract32(simd_data(desc), 1, 1); \
279 \
280 env->cc_op = vfene(v1, v2, v3, zs, MO_##BITS); \
281 }
282 DEF_VFENE_CC_HELPER(8)
283 DEF_VFENE_CC_HELPER(16)
284 DEF_VFENE_CC_HELPER(32)
285
/*
 * Isolate string: copy @v2 to @v1, clearing every bit from the leftmost
 * zero element onwards.
 *
 * @v1: destination vector
 * @v2: source vector
 * @es: element size as log2 of the size in bytes
 *
 * Returns 0 if a zero element was found, 3 otherwise (in which case @v1
 * receives an unmodified copy of @v2).
 */
static int vistr(void *v1, const void *v2, uint8_t es)
{
    const uint64_t lsbs = get_element_lsbs_mask(es);
    uint64_t a0 = s390_vec_read_element64(v2, 0);
    uint64_t a1 = s390_vec_read_element64(v2, 1);
    const uint64_t z0 = zero_search(a0, lsbs);
    /* The second half only matters if the first one has no zero element. */
    const uint64_t z1 = z0 ? 0 : zero_search(a1, lsbs);

    if (z0) {
        /* keep only the bits in front of the first zero element */
        a0 &= ~(-1ull >> clz64(z0));
        a1 = 0;
    } else if (z1) {
        a1 &= ~(-1ull >> clz64(z1));
    }

    s390_vec_write_element64(v1, 0, a0);
    s390_vec_write_element64(v1, 1, a1);
    return (z0 || z1) ? 0 : 3;
}
311
312 #define DEF_VISTR_HELPER(BITS) \
313 void HELPER(gvec_vistr##BITS)(void *v1, const void *v2, uint32_t desc) \
314 { \
315 vistr(v1, v2, MO_##BITS); \
316 }
317 DEF_VISTR_HELPER(8)
318 DEF_VISTR_HELPER(16)
319 DEF_VISTR_HELPER(32)
320
321 #define DEF_VISTR_CC_HELPER(BITS) \
322 void HELPER(gvec_vistr_cc##BITS)(void *v1, const void *v2, CPUS390XState *env, \
323 uint32_t desc) \
324 { \
325 env->cc_op = vistr(v1, v2, MO_##BITS); \
326 }
327 DEF_VISTR_CC_HELPER(8)
328 DEF_VISTR_CC_HELPER(16)
329 DEF_VISTR_CC_HELPER(32)
330
/*
 * Evaluate one range-compare control against a data element.
 *
 * @data: the element under test
 * @l:    the value to compare it with (unsigned comparison)
 * @c:    control byte; bit 7 selects "equal", bit 6 "lower" and bit 5
 *        "higher" as an accepted relation of @data to @l
 *
 * Returns the control bit that corresponds to the actual relation.
 */
static bool element_compare(uint32_t data, uint32_t l, uint8_t c)
{
    if (data == l) {
        return extract32(c, 7, 1);
    }
    /* bit 6 answers "data < l", bit 5 answers "data > l" */
    return extract32(c, data < l ? 6 : 5, 1);
}
344
vstrc(void * v1,const void * v2,const void * v3,const void * v4,bool in,bool rt,bool zs,uint8_t es)345 static int vstrc(void *v1, const void *v2, const void *v3, const void *v4,
346 bool in, bool rt, bool zs, uint8_t es)
347 {
348 const uint64_t mask = get_element_lsbs_mask(es);
349 uint64_t a0 = s390_vec_read_element64(v2, 0);
350 uint64_t a1 = s390_vec_read_element64(v2, 1);
351 int first_zero = 16, first_match = 16;
352 S390Vector rt_result = {};
353 uint64_t z0, z1;
354 int i, j;
355
356 if (zs) {
357 z0 = zero_search(a0, mask);
358 z1 = zero_search(a1, mask);
359 first_zero = match_index(z0, z1);
360 }
361
362 for (i = 0; i < 16 / (1 << es); i++) {
363 const uint32_t data = s390_vec_read_element(v2, i, es);
364 const int cur_byte = i * (1 << es);
365 bool any_match = false;
366
367 /* if we don't need a bit vector, we can stop early */
368 if (cur_byte == first_zero && !rt) {
369 break;
370 }
371
372 for (j = 0; j < 16 / (1 << es); j += 2) {
373 const uint32_t l1 = s390_vec_read_element(v3, j, es);
374 const uint32_t l2 = s390_vec_read_element(v3, j + 1, es);
375 /* we are only interested in the highest byte of each element */
376 const uint8_t c1 = s390_vec_read_element8(v4, j * (1 << es));
377 const uint8_t c2 = s390_vec_read_element8(v4, (j + 1) * (1 << es));
378
379 if (element_compare(data, l1, c1) &&
380 element_compare(data, l2, c2)) {
381 any_match = true;
382 break;
383 }
384 }
385 /* invert the result if requested */
386 any_match = in ^ any_match;
387
388 if (any_match) {
389 /* indicate bit vector if requested */
390 if (rt) {
391 const uint64_t val = -1ull;
392
393 first_match = MIN(cur_byte, first_match);
394 s390_vec_write_element(&rt_result, i, es, val);
395 } else {
396 /* stop on the first match */
397 first_match = cur_byte;
398 break;
399 }
400 }
401 }
402
403 if (rt) {
404 *(S390Vector *)v1 = rt_result;
405 } else {
406 s390_vec_write_element64(v1, 0, MIN(first_match, first_zero));
407 s390_vec_write_element64(v1, 1, 0);
408 }
409
410 if (first_zero == 16 && first_match == 16) {
411 return 3; /* no match */
412 } else if (first_zero == 16) {
413 return 1; /* matching elements, no match for zero */
414 } else if (first_match < first_zero) {
415 return 2; /* matching elements before match for zero */
416 }
417 return 0; /* match for zero */
418 }
419
420 #define DEF_VSTRC_HELPER(BITS) \
421 void HELPER(gvec_vstrc##BITS)(void *v1, const void *v2, const void *v3, \
422 const void *v4, uint32_t desc) \
423 { \
424 const bool in = extract32(simd_data(desc), 3, 1); \
425 const bool zs = extract32(simd_data(desc), 1, 1); \
426 \
427 vstrc(v1, v2, v3, v4, in, 0, zs, MO_##BITS); \
428 }
429 DEF_VSTRC_HELPER(8)
430 DEF_VSTRC_HELPER(16)
431 DEF_VSTRC_HELPER(32)
432
433 #define DEF_VSTRC_RT_HELPER(BITS) \
434 void HELPER(gvec_vstrc_rt##BITS)(void *v1, const void *v2, const void *v3, \
435 const void *v4, uint32_t desc) \
436 { \
437 const bool in = extract32(simd_data(desc), 3, 1); \
438 const bool zs = extract32(simd_data(desc), 1, 1); \
439 \
440 vstrc(v1, v2, v3, v4, in, 1, zs, MO_##BITS); \
441 }
442 DEF_VSTRC_RT_HELPER(8)
443 DEF_VSTRC_RT_HELPER(16)
444 DEF_VSTRC_RT_HELPER(32)
445
446 #define DEF_VSTRC_CC_HELPER(BITS) \
447 void HELPER(gvec_vstrc_cc##BITS)(void *v1, const void *v2, const void *v3, \
448 const void *v4, CPUS390XState *env, \
449 uint32_t desc) \
450 { \
451 const bool in = extract32(simd_data(desc), 3, 1); \
452 const bool zs = extract32(simd_data(desc), 1, 1); \
453 \
454 env->cc_op = vstrc(v1, v2, v3, v4, in, 0, zs, MO_##BITS); \
455 }
456 DEF_VSTRC_CC_HELPER(8)
457 DEF_VSTRC_CC_HELPER(16)
458 DEF_VSTRC_CC_HELPER(32)
459
460 #define DEF_VSTRC_CC_RT_HELPER(BITS) \
461 void HELPER(gvec_vstrc_cc_rt##BITS)(void *v1, const void *v2, const void *v3, \
462 const void *v4, CPUS390XState *env, \
463 uint32_t desc) \
464 { \
465 const bool in = extract32(simd_data(desc), 3, 1); \
466 const bool zs = extract32(simd_data(desc), 1, 1); \
467 \
468 env->cc_op = vstrc(v1, v2, v3, v4, in, 1, zs, MO_##BITS); \
469 }
470 DEF_VSTRC_CC_RT_HELPER(8)
471 DEF_VSTRC_CC_RT_HELPER(16)
472 DEF_VSTRC_CC_RT_HELPER(32)
473
/*
 * String search: look for the substring held in @v3 inside the string held
 * in @v2.
 *
 * @v1: destination vector; doubleword 0 receives the resulting byte index,
 *      doubleword 1 is zeroed
 * @v2: the string that is searched
 * @v3: the substring to look for
 * @v4: byte element 7 holds the substring length, which is shifted right by
 *      @es to obtain the length in elements
 * @es: element size as log2 of the size in bytes
 * @zs: treat a zero element as end of string in both @v2 and @v3
 *
 * The byte index written is that of the match, 0 for an empty substring, or
 * the vector length (16) if there is no match or the match is ignored.
 *
 * Returns:
 *   2 - full match (this includes the empty substring)
 *   3 - partial match: the substring runs past the end of the vector
 *   1 - no match, or a match located after the zero element that is
 *       therefore ignored; a zero element exists in @v2
 *   0 - no match and no zero element seen in @v2
 */
static int vstrs(S390Vector *v1, const S390Vector *v2, const S390Vector *v3,
                 const S390Vector *v4, uint8_t es, bool zs)
{
    const int nelem = 16 >> es;
    int substr_elen = s390_vec_read_element8(v4, 7) >> es;
    int str_leftmost_0 = nelem;
    /* Element index to report; nelem stands for "nothing found". */
    int found_at = nelem;
    int cc, j, k;

    /* With ZS the substring ends at min(nelem, strlen(v3)). */
    if (zs) {
        substr_elen = MIN(substr_elen, nelem);
        for (j = 0; j < substr_elen; j++) {
            if (s390_vec_read_element(v3, j, es) == 0) {
                substr_elen = j;
                break;
            }
        }
    }

    if (substr_elen == 0) {
        /* Degenerate case: an empty substring matches right away. */
        cc = 2;
        found_at = 0;
    } else {
        /* With ZS, find the end of the searched string. */
        if (zs) {
            for (k = 0; k < nelem; k++) {
                if (s390_vec_read_element(v2, k, es) == 0) {
                    str_leftmost_0 = k;
                    break;
                }
            }
        }

        /* Result if no candidate position matches. */
        cc = str_leftmost_0 == nelem ? 0 : 1;

        for (k = 0; k < nelem; k++) {
            /* Compare up to the end of the substring or of the vector. */
            const int end = MIN(nelem, k + substr_elen);

            for (j = k; j < end; j++) {
                const uint32_t e2 = s390_vec_read_element(v2, j, es);
                const uint32_t e3 = s390_vec_read_element(v3, j - k, es);

                if (e2 != e3) {
                    break;
                }
            }
            if (j != end) {
                /* Mismatch: try the next start position. */
                continue;
            }

            /* Every compared element matched. */
            if (k > str_leftmost_0) {
                /* Past the end of the string: ignored, index stays nelem. */
                cc = 1;
            } else {
                /* Full match if the whole substring fit, partial if not. */
                cc = (end - k == substr_elen) ? 2 : 3;
                found_at = k;
            }
            break;
        }
    }

    s390_vec_write_element64(v1, 0, found_at << es);
    s390_vec_write_element64(v1, 1, 0);
    return cc;
}
540
541 #define DEF_VSTRS_HELPER(BITS) \
542 void QEMU_FLATTEN HELPER(gvec_vstrs_##BITS)(void *v1, const void *v2, \
543 const void *v3, const void *v4, CPUS390XState *env, uint32_t desc) \
544 { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, false); } \
545 void QEMU_FLATTEN HELPER(gvec_vstrs_zs##BITS)(void *v1, const void *v2, \
546 const void *v3, const void *v4, CPUS390XState *env, uint32_t desc) \
547 { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, true); }
548
549 DEF_VSTRS_HELPER(8)
550 DEF_VSTRS_HELPER(16)
551 DEF_VSTRS_HELPER(32)
552