1 /* 2 * QEMU TCG support -- s390x vector string instruction support 3 * 4 * Copyright (C) 2019 Red Hat Inc 5 * 6 * Authors: 7 * David Hildenbrand <david@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or later. 10 * See the COPYING file in the top-level directory. 11 */ 12 #include "qemu/osdep.h" 13 #include "cpu.h" 14 #include "s390x-internal.h" 15 #include "vec.h" 16 #include "tcg/tcg.h" 17 #include "tcg/tcg-gvec-desc.h" 18 #include "exec/helper-proto.h" 19 20 /* 21 * Returns a bit set in the MSB of each element that is zero, 22 * as defined by the mask. 23 */ 24 static inline uint64_t zero_search(uint64_t a, uint64_t mask) 25 { 26 return ~(((a & mask) + mask) | a | mask); 27 } 28 29 /* 30 * Returns a bit set in the MSB of each element that is not zero, 31 * as defined by the mask. 32 */ 33 static inline uint64_t nonzero_search(uint64_t a, uint64_t mask) 34 { 35 return (((a & mask) + mask) | a) & ~mask; 36 } 37 38 /* 39 * Returns the byte offset for the first match, or 16 for no match. 40 */ 41 static inline int match_index(uint64_t c0, uint64_t c1) 42 { 43 return (c0 ? clz64(c0) : clz64(c1) + 64) >> 3; 44 } 45 46 /* 47 * Returns the number of bits composing one element. 48 */ 49 static uint8_t get_element_bits(uint8_t es) 50 { 51 return (1 << es) * BITS_PER_BYTE; 52 } 53 54 /* 55 * Returns the bitmask for a single element. 56 */ 57 static uint64_t get_single_element_mask(uint8_t es) 58 { 59 return -1ull >> (64 - get_element_bits(es)); 60 } 61 62 /* 63 * Returns the bitmask for a single element (excluding the MSB). 64 */ 65 static uint64_t get_single_element_lsbs_mask(uint8_t es) 66 { 67 return -1ull >> (65 - get_element_bits(es)); 68 } 69 70 /* 71 * Returns the bitmasks for multiple elements (excluding the MSBs). 72 */ 73 static uint64_t get_element_lsbs_mask(uint8_t es) 74 { 75 return dup_const(es, get_single_element_lsbs_mask(es)); 76 } 77 78 static int vfae(void *v1, const void *v2, const void *v3, bool in, 79 bool rt, bool zs, uint8_t es) 80 { 81 const uint64_t mask = get_element_lsbs_mask(es); 82 const int bits = get_element_bits(es); 83 uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1; 84 uint64_t first_zero = 16; 85 uint64_t first_equal; 86 int i; 87 88 a0 = s390_vec_read_element64(v2, 0); 89 a1 = s390_vec_read_element64(v2, 1); 90 b0 = s390_vec_read_element64(v3, 0); 91 b1 = s390_vec_read_element64(v3, 1); 92 e0 = 0; 93 e1 = 0; 94 /* compare against equality with every other element */ 95 for (i = 0; i < 64; i += bits) { 96 t0 = rol64(b0, i); 97 t1 = rol64(b1, i); 98 e0 |= zero_search(a0 ^ t0, mask); 99 e0 |= zero_search(a0 ^ t1, mask); 100 e1 |= zero_search(a1 ^ t0, mask); 101 e1 |= zero_search(a1 ^ t1, mask); 102 } 103 /* invert the result if requested - invert only the MSBs */ 104 if (in) { 105 e0 = ~e0 & ~mask; 106 e1 = ~e1 & ~mask; 107 } 108 first_equal = match_index(e0, e1); 109 110 if (zs) { 111 z0 = zero_search(a0, mask); 112 z1 = zero_search(a1, mask); 113 first_zero = match_index(z0, z1); 114 } 115 116 if (rt) { 117 e0 = (e0 >> (bits - 1)) * get_single_element_mask(es); 118 e1 = (e1 >> (bits - 1)) * get_single_element_mask(es); 119 s390_vec_write_element64(v1, 0, e0); 120 s390_vec_write_element64(v1, 1, e1); 121 } else { 122 s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero)); 123 s390_vec_write_element64(v1, 1, 0); 124 } 125 126 if (first_zero == 16 && first_equal == 16) { 127 return 3; /* no match */ 128 } else if (first_zero == 16) { 129 return 1; /* matching elements, no match for zero */ 130 } else if (first_equal < first_zero) { 131 return 2; /* matching elements before match for zero */ 132 } 133 return 0; /* match for zero */ 134 } 135 136 #define DEF_VFAE_HELPER(BITS) \ 137 void HELPER(gvec_vfae##BITS)(void *v1, const void *v2, const void *v3, \ 138 uint32_t desc) \ 139 { \ 140 const bool in = extract32(simd_data(desc), 3, 1); \ 141 const bool rt = extract32(simd_data(desc), 2, 1); \ 142 const bool zs = extract32(simd_data(desc), 1, 1); \ 143 \ 144 vfae(v1, v2, v3, in, rt, zs, MO_##BITS); \ 145 } 146 DEF_VFAE_HELPER(8) 147 DEF_VFAE_HELPER(16) 148 DEF_VFAE_HELPER(32) 149 150 #define DEF_VFAE_CC_HELPER(BITS) \ 151 void HELPER(gvec_vfae_cc##BITS)(void *v1, const void *v2, const void *v3, \ 152 CPUS390XState *env, uint32_t desc) \ 153 { \ 154 const bool in = extract32(simd_data(desc), 3, 1); \ 155 const bool rt = extract32(simd_data(desc), 2, 1); \ 156 const bool zs = extract32(simd_data(desc), 1, 1); \ 157 \ 158 env->cc_op = vfae(v1, v2, v3, in, rt, zs, MO_##BITS); \ 159 } 160 DEF_VFAE_CC_HELPER(8) 161 DEF_VFAE_CC_HELPER(16) 162 DEF_VFAE_CC_HELPER(32) 163 164 static int vfee(void *v1, const void *v2, const void *v3, bool zs, uint8_t es) 165 { 166 const uint64_t mask = get_element_lsbs_mask(es); 167 uint64_t a0, a1, b0, b1, e0, e1, z0, z1; 168 uint64_t first_zero = 16; 169 uint64_t first_equal; 170 171 a0 = s390_vec_read_element64(v2, 0); 172 a1 = s390_vec_read_element64(v2, 1); 173 b0 = s390_vec_read_element64(v3, 0); 174 b1 = s390_vec_read_element64(v3, 1); 175 e0 = zero_search(a0 ^ b0, mask); 176 e1 = zero_search(a1 ^ b1, mask); 177 first_equal = match_index(e0, e1); 178 179 if (zs) { 180 z0 = zero_search(a0, mask); 181 z1 = zero_search(a1, mask); 182 first_zero = match_index(z0, z1); 183 } 184 185 s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero)); 186 s390_vec_write_element64(v1, 1, 0); 187 if (first_zero == 16 && first_equal == 16) { 188 return 3; /* no match */ 189 } else if (first_zero == 16) { 190 return 1; /* matching elements, no match for zero */ 191 } else if (first_equal < first_zero) { 192 return 2; /* matching elements before match for zero */ 193 } 194 return 0; /* match for zero */ 195 } 196 197 #define DEF_VFEE_HELPER(BITS) \ 198 void HELPER(gvec_vfee##BITS)(void *v1, const void *v2, const void *v3, \ 199 uint32_t desc) \ 200 { \ 201 const bool zs = extract32(simd_data(desc), 1, 1); \ 202 \ 203 vfee(v1, v2, v3, zs, MO_##BITS); \ 204 } 205 DEF_VFEE_HELPER(8) 206 DEF_VFEE_HELPER(16) 207 DEF_VFEE_HELPER(32) 208 209 #define DEF_VFEE_CC_HELPER(BITS) \ 210 void HELPER(gvec_vfee_cc##BITS)(void *v1, const void *v2, const void *v3, \ 211 CPUS390XState *env, uint32_t desc) \ 212 { \ 213 const bool zs = extract32(simd_data(desc), 1, 1); \ 214 \ 215 env->cc_op = vfee(v1, v2, v3, zs, MO_##BITS); \ 216 } 217 DEF_VFEE_CC_HELPER(8) 218 DEF_VFEE_CC_HELPER(16) 219 DEF_VFEE_CC_HELPER(32) 220 221 static int vfene(void *v1, const void *v2, const void *v3, bool zs, uint8_t es) 222 { 223 const uint64_t mask = get_element_lsbs_mask(es); 224 uint64_t a0, a1, b0, b1, e0, e1, z0, z1; 225 uint64_t first_zero = 16; 226 uint64_t first_inequal; 227 bool smaller = false; 228 229 a0 = s390_vec_read_element64(v2, 0); 230 a1 = s390_vec_read_element64(v2, 1); 231 b0 = s390_vec_read_element64(v3, 0); 232 b1 = s390_vec_read_element64(v3, 1); 233 e0 = nonzero_search(a0 ^ b0, mask); 234 e1 = nonzero_search(a1 ^ b1, mask); 235 first_inequal = match_index(e0, e1); 236 237 /* identify the smaller element */ 238 if (first_inequal < 16) { 239 uint8_t enr = first_inequal / (1 << es); 240 uint32_t a = s390_vec_read_element(v2, enr, es); 241 uint32_t b = s390_vec_read_element(v3, enr, es); 242 243 smaller = a < b; 244 } 245 246 if (zs) { 247 z0 = zero_search(a0, mask); 248 z1 = zero_search(a1, mask); 249 first_zero = match_index(z0, z1); 250 } 251 252 s390_vec_write_element64(v1, 0, MIN(first_inequal, first_zero)); 253 s390_vec_write_element64(v1, 1, 0); 254 if (first_zero == 16 && first_inequal == 16) { 255 return 3; 256 } else if (first_zero < first_inequal) { 257 return 0; 258 } 259 return smaller ? 1 : 2; 260 } 261 262 #define DEF_VFENE_HELPER(BITS) \ 263 void HELPER(gvec_vfene##BITS)(void *v1, const void *v2, const void *v3, \ 264 uint32_t desc) \ 265 { \ 266 const bool zs = extract32(simd_data(desc), 1, 1); \ 267 \ 268 vfene(v1, v2, v3, zs, MO_##BITS); \ 269 } 270 DEF_VFENE_HELPER(8) 271 DEF_VFENE_HELPER(16) 272 DEF_VFENE_HELPER(32) 273 274 #define DEF_VFENE_CC_HELPER(BITS) \ 275 void HELPER(gvec_vfene_cc##BITS)(void *v1, const void *v2, const void *v3, \ 276 CPUS390XState *env, uint32_t desc) \ 277 { \ 278 const bool zs = extract32(simd_data(desc), 1, 1); \ 279 \ 280 env->cc_op = vfene(v1, v2, v3, zs, MO_##BITS); \ 281 } 282 DEF_VFENE_CC_HELPER(8) 283 DEF_VFENE_CC_HELPER(16) 284 DEF_VFENE_CC_HELPER(32) 285 286 static int vistr(void *v1, const void *v2, uint8_t es) 287 { 288 const uint64_t mask = get_element_lsbs_mask(es); 289 uint64_t a0 = s390_vec_read_element64(v2, 0); 290 uint64_t a1 = s390_vec_read_element64(v2, 1); 291 uint64_t z; 292 int cc = 3; 293 294 z = zero_search(a0, mask); 295 if (z) { 296 a0 &= ~(-1ull >> clz64(z)); 297 a1 = 0; 298 cc = 0; 299 } else { 300 z = zero_search(a1, mask); 301 if (z) { 302 a1 &= ~(-1ull >> clz64(z)); 303 cc = 0; 304 } 305 } 306 307 s390_vec_write_element64(v1, 0, a0); 308 s390_vec_write_element64(v1, 1, a1); 309 return cc; 310 } 311 312 #define DEF_VISTR_HELPER(BITS) \ 313 void HELPER(gvec_vistr##BITS)(void *v1, const void *v2, uint32_t desc) \ 314 { \ 315 vistr(v1, v2, MO_##BITS); \ 316 } 317 DEF_VISTR_HELPER(8) 318 DEF_VISTR_HELPER(16) 319 DEF_VISTR_HELPER(32) 320 321 #define DEF_VISTR_CC_HELPER(BITS) \ 322 void HELPER(gvec_vistr_cc##BITS)(void *v1, const void *v2, CPUS390XState *env, \ 323 uint32_t desc) \ 324 { \ 325 env->cc_op = vistr(v1, v2, MO_##BITS); \ 326 } 327 DEF_VISTR_CC_HELPER(8) 328 DEF_VISTR_CC_HELPER(16) 329 DEF_VISTR_CC_HELPER(32) 330 331 static bool element_compare(uint32_t data, uint32_t l, uint8_t c) 332 { 333 const bool equal = extract32(c, 7, 1); 334 const bool lower = extract32(c, 6, 1); 335 const bool higher = extract32(c, 5, 1); 336 337 if (data < l) { 338 return lower; 339 } else if (data > l) { 340 return higher; 341 } 342 return equal; 343 } 344 345 static int vstrc(void *v1, const void *v2, const void *v3, const void *v4, 346 bool in, bool rt, bool zs, uint8_t es) 347 { 348 const uint64_t mask = get_element_lsbs_mask(es); 349 uint64_t a0 = s390_vec_read_element64(v2, 0); 350 uint64_t a1 = s390_vec_read_element64(v2, 1); 351 int first_zero = 16, first_match = 16; 352 S390Vector rt_result = {}; 353 uint64_t z0, z1; 354 int i, j; 355 356 if (zs) { 357 z0 = zero_search(a0, mask); 358 z1 = zero_search(a1, mask); 359 first_zero = match_index(z0, z1); 360 } 361 362 for (i = 0; i < 16 / (1 << es); i++) { 363 const uint32_t data = s390_vec_read_element(v2, i, es); 364 const int cur_byte = i * (1 << es); 365 bool any_match = false; 366 367 /* if we don't need a bit vector, we can stop early */ 368 if (cur_byte == first_zero && !rt) { 369 break; 370 } 371 372 for (j = 0; j < 16 / (1 << es); j += 2) { 373 const uint32_t l1 = s390_vec_read_element(v3, j, es); 374 const uint32_t l2 = s390_vec_read_element(v3, j + 1, es); 375 /* we are only interested in the highest byte of each element */ 376 const uint8_t c1 = s390_vec_read_element8(v4, j * (1 << es)); 377 const uint8_t c2 = s390_vec_read_element8(v4, (j + 1) * (1 << es)); 378 379 if (element_compare(data, l1, c1) && 380 element_compare(data, l2, c2)) { 381 any_match = true; 382 break; 383 } 384 } 385 /* invert the result if requested */ 386 any_match = in ^ any_match; 387 388 if (any_match) { 389 /* indicate bit vector if requested */ 390 if (rt) { 391 const uint64_t val = -1ull; 392 393 first_match = MIN(cur_byte, first_match); 394 s390_vec_write_element(&rt_result, i, es, val); 395 } else { 396 /* stop on the first match */ 397 first_match = cur_byte; 398 break; 399 } 400 } 401 } 402 403 if (rt) { 404 *(S390Vector *)v1 = rt_result; 405 } else { 406 s390_vec_write_element64(v1, 0, MIN(first_match, first_zero)); 407 s390_vec_write_element64(v1, 1, 0); 408 } 409 410 if (first_zero == 16 && first_match == 16) { 411 return 3; /* no match */ 412 } else if (first_zero == 16) { 413 return 1; /* matching elements, no match for zero */ 414 } else if (first_match < first_zero) { 415 return 2; /* matching elements before match for zero */ 416 } 417 return 0; /* match for zero */ 418 } 419 420 #define DEF_VSTRC_HELPER(BITS) \ 421 void HELPER(gvec_vstrc##BITS)(void *v1, const void *v2, const void *v3, \ 422 const void *v4, uint32_t desc) \ 423 { \ 424 const bool in = extract32(simd_data(desc), 3, 1); \ 425 const bool zs = extract32(simd_data(desc), 1, 1); \ 426 \ 427 vstrc(v1, v2, v3, v4, in, 0, zs, MO_##BITS); \ 428 } 429 DEF_VSTRC_HELPER(8) 430 DEF_VSTRC_HELPER(16) 431 DEF_VSTRC_HELPER(32) 432 433 #define DEF_VSTRC_RT_HELPER(BITS) \ 434 void HELPER(gvec_vstrc_rt##BITS)(void *v1, const void *v2, const void *v3, \ 435 const void *v4, uint32_t desc) \ 436 { \ 437 const bool in = extract32(simd_data(desc), 3, 1); \ 438 const bool zs = extract32(simd_data(desc), 1, 1); \ 439 \ 440 vstrc(v1, v2, v3, v4, in, 1, zs, MO_##BITS); \ 441 } 442 DEF_VSTRC_RT_HELPER(8) 443 DEF_VSTRC_RT_HELPER(16) 444 DEF_VSTRC_RT_HELPER(32) 445 446 #define DEF_VSTRC_CC_HELPER(BITS) \ 447 void HELPER(gvec_vstrc_cc##BITS)(void *v1, const void *v2, const void *v3, \ 448 const void *v4, CPUS390XState *env, \ 449 uint32_t desc) \ 450 { \ 451 const bool in = extract32(simd_data(desc), 3, 1); \ 452 const bool zs = extract32(simd_data(desc), 1, 1); \ 453 \ 454 env->cc_op = vstrc(v1, v2, v3, v4, in, 0, zs, MO_##BITS); \ 455 } 456 DEF_VSTRC_CC_HELPER(8) 457 DEF_VSTRC_CC_HELPER(16) 458 DEF_VSTRC_CC_HELPER(32) 459 460 #define DEF_VSTRC_CC_RT_HELPER(BITS) \ 461 void HELPER(gvec_vstrc_cc_rt##BITS)(void *v1, const void *v2, const void *v3, \ 462 const void *v4, CPUS390XState *env, \ 463 uint32_t desc) \ 464 { \ 465 const bool in = extract32(simd_data(desc), 3, 1); \ 466 const bool zs = extract32(simd_data(desc), 1, 1); \ 467 \ 468 env->cc_op = vstrc(v1, v2, v3, v4, in, 1, zs, MO_##BITS); \ 469 } 470 DEF_VSTRC_CC_RT_HELPER(8) 471 DEF_VSTRC_CC_RT_HELPER(16) 472 DEF_VSTRC_CC_RT_HELPER(32) 473 474 static int vstrs(S390Vector *v1, const S390Vector *v2, const S390Vector *v3, 475 const S390Vector *v4, uint8_t es, bool zs) 476 { 477 int substr_elen, i, j, k, cc; 478 int nelem = 16 >> es; 479 int str_leftmost_0; 480 481 substr_elen = s390_vec_read_element8(v4, 7) >> es; 482 483 /* If ZS, bound substr length by min(nelem, strlen(v3)). */ 484 if (zs) { 485 substr_elen = MIN(substr_elen, nelem); 486 for (i = 0; i < substr_elen; i++) { 487 if (s390_vec_read_element(v3, i, es) == 0) { 488 substr_elen = i; 489 break; 490 } 491 } 492 } 493 494 if (substr_elen == 0) { 495 cc = 2; /* full match for degenerate case of empty substr */ 496 k = 0; 497 goto done; 498 } 499 500 /* If ZS, look for eos in the searched string. */ 501 str_leftmost_0 = nelem; 502 if (zs) { 503 for (k = 0; k < nelem; k++) { 504 if (s390_vec_read_element(v2, k, es) == 0) { 505 str_leftmost_0 = k; 506 break; 507 } 508 } 509 } 510 511 cc = str_leftmost_0 == nelem ? 0 : 1; /* No match. */ 512 for (k = 0; k < nelem; k++) { 513 i = MIN(nelem, k + substr_elen); 514 for (j = k; j < i; j++) { 515 uint32_t e2 = s390_vec_read_element(v2, j, es); 516 uint32_t e3 = s390_vec_read_element(v3, j - k, es); 517 if (e2 != e3) { 518 break; 519 } 520 } 521 if (j == i) { 522 /* All elements matched. */ 523 if (k > str_leftmost_0) { 524 cc = 1; /* Ignored match. */ 525 k = nelem; 526 } else if (i - k == substr_elen) { 527 cc = 2; /* Full match. */ 528 } else { 529 cc = 3; /* Partial match. */ 530 } 531 break; 532 } 533 } 534 535 done: 536 s390_vec_write_element64(v1, 0, k << es); 537 s390_vec_write_element64(v1, 1, 0); 538 return cc; 539 } 540 541 #define DEF_VSTRS_HELPER(BITS) \ 542 void QEMU_FLATTEN HELPER(gvec_vstrs_##BITS)(void *v1, const void *v2, \ 543 const void *v3, const void *v4, CPUS390XState *env, uint32_t desc) \ 544 { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, false); } \ 545 void QEMU_FLATTEN HELPER(gvec_vstrs_zs##BITS)(void *v1, const void *v2, \ 546 const void *v3, const void *v4, CPUS390XState *env, uint32_t desc) \ 547 { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, true); } 548 549 DEF_VSTRS_HELPER(8) 550 DEF_VSTRS_HELPER(16) 551 DEF_VSTRS_HELPER(32) 552