/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
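/*
 * Roughly, this is the computation below, shown as an illustrative C
 * sketch (not part of this file; the variable names are invented here
 * for explanation only):
 *
 *	u64 sum = (u32)initial_sum;
 *	for (each aligned 64-bit word w of buff)
 *		sum = sum + w + carry;		// adde keeps the carry chain
 *	// ...plus the sub-8-byte head/tail handling below...
 *	// fold the 64-bit accumulator to 32 bits with an end-around carry
 *	u32 res = (u32)sum + (u32)(sum >> 32);
 *	if (res < (u32)sum)
 *		res++;
 *	return res;
 */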
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm
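/*
 * Note: each macro above plants a local label on the memory access that
 * immediately follows it and registers that address in the exception
 * table (EX_TABLE), so a fault in that access branches to the named
 * handler.  The "source"/"dest" variants are used inside the unrolled
 * main loop, where the handlers must first restore r14-r16 and pop the
 * stack frame; the "srcnr"/"dstnr" ("no restore") variants are used
 * outside that loop and go straight to the error reporting code.
 */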
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			    const struct in6_addr *daddr,
 *			    __u32 len, __u8 proto, __wsum sum)
 */
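/*
 * Illustrative sketch of the pseudo-header sum computed below (C-style
 * pseudo-code for explanation only; the names are invented here):
 *
 *	u64 s;
 *	s  = saddr 64-bit half 0 + saddr 64-bit half 1;	// addc/adde keep carries
 *	s += daddr 64-bit half 0 + daddr 64-bit half 1;
 *	s += len + proto + sum;
 *	// then fold s to 32 bits, fold again to 16 bits (end-around
 *	// carries), and return the one's complement of the result.
 */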
_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
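/*
 * For reference, the final 16-bit folding in csum_ipv6_magic above is
 * roughly the following C (illustrative only, not part of the build;
 * s32 stands for the already 32-bit-folded sum):
 *
 *	u32 t = (s32 >> 16) + (s32 & 0xffff);
 *	t += t >> 16;			// end-around carry
 *	return (u16)~t;			// one's complement, low 16 bits
 */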