/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                  ^                                 ^
 * 0xbbbb10                           0xbbbb20                          0xbbbb30
 *                            ^
 *                            _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are two categories of memcmp():
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named .Ldiffoffset_xxxx.
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Record whether the src/dst addresses have the same offset from
	 * an 8-byte alignment boundary; cr0 is consumed at .Lno_short to
	 * choose the same-offset or different-offset path.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when there are fewer than 8 bytes
	 * to compare.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

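/*
 * Byte-at-a-time comparison, unrolled four bytes per chunk, with CTR
 * counting the remaining bytes.  Roughly equivalent C (an illustrative
 * sketch only; s1/s2 stand for r3/r4 as const unsigned char pointers and
 * len for r5):
 *
 *	while (len--) {
 *		int d = *s1++ - *s2++;
 *		if (d)
 *			return d;
 *	}
 *	return 0;
 */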
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte-aligned addresses.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the double word at (src & ~7UL) and shift it left by the
	 * appropriate number of bits before the comparison.
	 */
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/* Fewer than 8 bytes are left to compare; at least the s1 address
	 * is 8-byte aligned.
	 * The next double words are loaded and shifted right by the
	 * appropriate number of bits.
	 */
	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

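/*
 * Main 8-byte-aligned path: 32 bytes are compared per iteration, software
 * pipelined so the loads for the next chunk are issued while the previous
 * chunk's cmpld results are still pending in cr0/cr1/cr6/cr7.  Because LD
 * is an endian-aware 64-bit load (byte-reversed on LE), the register always
 * holds the bytes in big-endian order, so one unsigned doubleword compare
 * orders the eight bytes exactly as a byte-wise memcmp() would.  A roughly
 * equivalent, non-pipelined C sketch (illustrative only; load8_be() is a
 * hypothetical stand-in for the LD macro):
 *
 *	while (len >= 32) {
 *		for (i = 0; i < 32; i += 8) {
 *			u64 a = load8_be(s1 + i), b = load8_be(s2 + i);
 *			if (a != b)
 *				return a > b ? 1 : -1;
 *		}
 *		s1 += 32; s2 += 32; len -= 32;
 *	}
 */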
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the VMX loop if the length is 4K or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is 8-byte aligned */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip restoring the non-volatile GPRs */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

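/*
 * Both VMX paths below (.Lsameoffset_vmx_cmp and .Ldiffoffset_vmx_cmp)
 * run a scalar 32-byte pre-check before paying the VMX enter/exit cost,
 * so the common early-mismatch case never touches the vector unit.
 * A rough C sketch of that pre-check (illustrative only; load8_be()
 * again stands in for the LD macro):
 *
 *	for (i = 0; i < 4; i++) {
 *		u64 a = load8_be(s1), b = load8_be(s2);
 *		if (a != b)
 *			return a > b ? 1 : -1;
 *		s1 += 8; s2 += 8; len -= 8;
 *	}
 *
 * Only after all four pre-check compares pass is ENTER_VMX_OPS executed
 * and the vector loop entered.
 */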
#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset from
	 * an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following observation:
	 * memcmp() tends to fail (mismatch) early, within the first 32
	 * bytes.  Before using VMX instructions, which incur a 32x128-bit
	 * VMX register save/restore penalty, compare the first 32 bytes
	 * with scalar loads so that the ~80% of calls that mismatch early
	 * are caught here.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Align further, to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 across EXIT_VMX_OPS */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes per loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* locate the difference within the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is 8-byte aligned */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do VMX ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

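/*
 * Different-offset VMX path: r3 is aligned to 16 bytes first, while r4
 * stays unaligned.  Each r4 quadword is rebuilt from two aligned lvx
 * loads merged by vperm with the LVS-generated mask (see LD_VSR_CROSS16B
 * above).  Per 16-byte step, conceptually (an illustrative sketch only;
 * v6 holds the previously loaded aligned QW of r4 and v8 the next one):
 *
 *	v9  = next 16 aligned bytes of r3
 *	v10 = vperm(v6, v8, v4)           merged, realigned bytes of r4
 *	if any byte of v9 differs from v10 -> .Ldiffoffset_vmx_diff_found
 *	v6  = v8                          carry the QW over to the next step
 */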
.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is 16-byte aligned */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop handles 32 bytes per iteration */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first QW of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the difference lies within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)
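/*
 * Overall structure, as a rough C sketch (illustrative only; it flattens
 * the label flow above, ignores the CONFIG_ALTIVEC/CPU-feature
 * conditionals and the alignment fix-up steps, and the helper names are
 * made up):
 *
 *	if (len == 0)
 *		return 0;				.Lzero
 *	if (len < 8)
 *		return byte_loop(s1, s2, len);		.Lshort
 *	if (((unsigned long)s1 ^ (unsigned long)s2) & 7)
 *		return diffoffset_cmp(s1, s2, len);	.Ldiffoffset_*
 *	return sameoffset_cmp(s1, s2, len);		.Llong / .Lsameoffset_*
 */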