/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                             ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
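/*
 * Each LD/cmpld pair below compares 8 bytes in memcmp order: LD is a
 * byte-reversed load on little endian, so the first byte in memory always
 * ends up in the most significant byte of the register, and an unsigned
 * doubleword compare then orders the two registers exactly as a bytewise
 * compare of the underlying memory would.  Illustrative C sketch of that
 * property (not part of the build; the function name is invented here):
 *
 *	#include <stdint.h>
 *
 *	static int cmp_8_bytes(const unsigned char *a, const unsigned char *b)
 *	{
 *		uint64_t x = 0, y = 0;
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {	// first byte becomes most significant
 *			x = (x << 8) | a[i];
 *			y = (y << 8) | b[i];
 *		}
 *		return (x > y) - (x < y);	// same sign as memcmp(a, b, 8)
 *	}
 */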
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the
	 * same offset from an 8-byte aligned boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop if comparing fewer than 8 bytes
	 * at aligned addresses.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start

.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte aligned addresses.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 to 31 bytes; at least the r3 address is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 to 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have fewer than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

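/*
 * 32-bytes-per-iteration compare loop.  The four doubleword compares of one
 * 32-byte block are spread over cr0/cr1/cr6/cr7, and the loads for the next
 * block are issued before the branches that consume the previous compares,
 * so loads and compares stay overlapped.  The extra registers this needs
 * (r27-r31) are saved below the stack pointer and restored in .Ltail/.Lout.
 */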
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try the vmx loop if the length is 4K or greater */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 address is 8-byte aligned */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset from an
	 * 8-byte aligned boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to find a difference within the first 32 bytes.
	 * Before using VMX instructions (which incur the penalty of
	 * saving/restoring the 32 x 128-bit VMX registers), compare the
	 * first 32 bytes so that the ~80% of calls that fail early are
	 * caught here.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

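	/*
	 * ENTER_VMX_OPS preserves r3/r4/r5 across the call to enter_vmx_ops()
	 * and sets cr1 from its return value; a zero return means VMX cannot
	 * be used in this context, in which case we continue with the scalar
	 * .Llong_novmx_cmp path.  Every exit from the vector loops goes
	 * through the matching EXIT_VMX_OPS.
	 */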
	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is at least 4KB; align further to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes per loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* the difference is within these 16 bytes; find it with scalar compares */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is 8-byte aligned. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or greater */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

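/*
 * lvx ignores the low four address bits and always loads an aligned
 * quadword, so the code below builds each unaligned 16-byte chunk of s2
 * from two aligned loads merged by LD_VSR_CROSS16B (LVS + vperm).
 * Byte-level C sketch of that merge (illustrative only, not part of the
 * build; the names are invented here):
 *
 *	#include <string.h>
 *
 *	static void merge_cross16(const unsigned char *lo16,	// addr & ~0xfUL
 *				  const unsigned char *hi16,	// lo16 + 16
 *				  unsigned int off,		// addr & 0xf
 *				  unsigned char out[16])
 *	{
 *		unsigned char both[32];
 *
 *		memcpy(both, lo16, 16);
 *		memcpy(both + 16, hi16, 16);
 *		memcpy(out, both + off, 16);	// the 16 bytes starting at addr
 *	}
 */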
.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is 16-byte aligned */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* 32 bytes per loop iteration */
	clrldi	r5,r5,59
	mtctr	r6

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first aligned QW of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* the difference lies within the next 16 bytes; recompare them with the scalar code */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)