/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
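
/*
 * For reference only (a sketch, not built as part of this file; the helper
 * name is made up): the rldicl/add/srdi sequence at .Lcsum_finish mirrors
 * the C below.  The two 32-bit halves of the 64-bit accumulator are added
 * and the upper word of the result is returned as the 32-bit partial
 * checksum, which C callers typically reduce to 16 bits later (e.g. with
 * csum_fold()):
 *
 *	static inline u32 fold_64_to_32(u64 acc)
 *	{
 *		u64 rot = (acc << 32) | (acc >> 32);	// swap the halves
 *
 *		return (u32)((acc + rot) >> 32);
 *	}
 */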

	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm
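
/*
 * Note on the macros above: EX_TABLE(addr, handler) emits an exception
 * table entry so that a fault on the load or store written immediately
 * after the macro invocation is fixed up at "handler".  The source/dest
 * variants point at handlers that restore r14-r16 and pop the stack frame,
 * so they are used inside the unrolled loop below where that frame is
 * live; the srcnr/dstnr variants are used on the prologue and tail paths,
 * which run without a frame.
 */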

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)
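
/*
 * Illustration only (hypothetical caller, not part of this file): on a
 * fault the routine above stores -EFAULT through the corresponding error
 * pointer and returns without a usable checksum, so a C caller (assuming
 * the usual prototype with int error slots) might recover roughly as
 * below, following the options listed in the header comment -- here by
 * zeroing the destination and re-checksumming it:
 *
 *	int src_err = 0, dst_err = 0;
 *	__wsum csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err) {
 *		memset(dst, 0, len);
 *		csum = csum_partial(dst, len, sum);
 *	}
 *
 * Real callers may do something different; this is just one of the
 * recovery actions the header comment mentions.
 */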

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			    const struct in6_addr *daddr,
 *			    __u32 len, __u8 proto, __wsum sum)
 */

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
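
/*
 * For reference only (a sketch with a made-up name, not built here): the
 * folding tail of csum_ipv6_magic mirrors the C below, first reducing the
 * 64-bit one's complement sum to 32 bits, then to 16 bits, and returning
 * the complement of the result:
 *
 *	static inline u16 fold_and_complement(u64 sum)
 *	{
 *		u32 s, t;
 *
 *		s = (u32)((sum + ((sum << 32) | (sum >> 32))) >> 32);
 *		t = s + ((s << 16) | (s >> 16));
 *		return (u16)(~t >> 16);
 *	}
 */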