/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)

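/*
 * Each of the macros below tags the single load (srcnr/source) or store
 * (dstnr/dest) that follows its invocation with an __ex_table entry, so an
 * access fault on that instruction is redirected to the matching fixup
 * label (.Lsrc_error*, .Ldest_error*) near the end of this file instead of
 * being treated as a fatal kernel fault.  The *_nr variants are used
 * outside the unrolled loop, where no stack frame is active, so their
 * fixup paths skip the r14-r16 restore and the stack pop.
 */
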
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

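/*
 * The copy-and-checksum routine below mirrors __csum_partial above: the
 * same halfword alignment pre-loop, 64-byte unrolled main loop and tail
 * handling, except that every load is wrapped by srcnr/source and every
 * store by dstnr/dest so that faults are reported through the error
 * pointers instead of being fatal.
 */
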
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)

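/*
 * Caller-side sketch for csum_partial_copy_generic (illustrative only;
 * the local variables and the recovery action are assumptions about
 * typical usage, not code from this file):
 *
 *	int src_err = 0, dst_err = 0;
 *	__wsum csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err)
 *		... recover, e.g. zero dst and recompute, or return an error ...
 */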