/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56
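	/*
	 * r6 is now len rounded down to a multiple of 128 bytes (the low
	 * 7 bits are cleared); the remaining tail bytes are handled after
	 * the main loop.
	 */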

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
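	/*
	 * In other words, for a given 128 byte chunk the work is spread
	 * over three consecutive iterations of the loop at 4: below -
	 * it is loaded (lvx) in one iteration, multiplied against the
	 * constants (VPMSUMD) in the next, and its products are folded
	 * into the v0-v7 accumulators (vxor) in the one after that.
	 */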
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127
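	/*
	 * r5 is now len % 128. Since the input is a multiple of 16 bytes
	 * this is 0-112 bytes, ie 0-7 remaining 16 byte chunks.
	 */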

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
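	/*
	 * Roughly, both variants perform the same two carry-less multiply
	 * steps. As an illustration only (not part of the build), with
	 * clmul() standing in for vpmsumd:
	 *
	 *	q   = clmul(a, m) >> 64;	q = floor(a*m / 2^64)
	 *	crc = a ^ clmul(q, n);		a - q*n in GF(2)
	 *
	 * In the reflected form below the ">> 64" becomes a mask of the
	 * low 32 bits, since the reflected constants put the interesting
	 * bits at the other end of the register.
	 */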
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes:
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

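	/*
	 * Branch (or fall through) into the chain below at the label that
	 * matches the number of chunks processed. Each step xors one
	 * partial product into one of two accumulators (v19 and v20,
	 * alternating to shorten the dependency chain) before falling
	 * through to the final combine.
	 */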
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)
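/*
 * The file that includes this one is expected to provide the constant
 * tables referenced above (.byteswap_constant, .constants,
 * .short_constants and .barrett_constants) in addition to defining
 * CRC_FUNCTION_NAME.
 */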