/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2	/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	lwz	r0,4(r3)
	mtctr	r6
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdz	23f
22:	lwz	r0,4(r3)
	adde	r5,r5,r8
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdnz	22b
23:	adde	r5,r5,r8
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
EXPORT_SYMBOL(__csum_partial)
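
/*
 * For reference, a minimal C sketch of the word loop above; it computes
 * a 32-bit 1's complement (end-around-carry) sum.  Illustrative only:
 * the function name is hypothetical, not a kernel API.
 *
 *	u32 csum_words_sketch(const u32 *buff, int nwords, u32 sum)
 *	{
 *		while (nwords--) {
 *			u32 w = *buff++;
 *			sum += w;
 *			if (sum < w)	// carry out of bit 31...
 *				sum++;	// ...wraps back into bit 0
 *		}
 *		return sum;
 *	}
 *
 * The adde instructions accumulate those carries in the CA bit instead
 * of folding them per word, and the addze at label 5: adds the last
 * carry back in; both orderings yield the same 1's complement checksum.
 */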

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	EX_TABLE(8 ## n ## 0b, src_error);	\
	EX_TABLE(8 ## n ## 1b, src_error);	\
	EX_TABLE(8 ## n ## 2b, src_error);	\
	EX_TABLE(8 ## n ## 3b, src_error);	\
	EX_TABLE(8 ## n ## 4b, dst_error);	\
	EX_TABLE(8 ## n ## 5b, dst_error);	\
	EX_TABLE(8 ## n ## 6b, dst_error);	\
	EX_TABLE(8 ## n ## 7b, dst_error);

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8
	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0	/* is destination address even ? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	EX_TABLE(70b, src_error);
	EX_TABLE(71b, dst_error);
	EX_TABLE(72b, src_error);
	EX_TABLE(73b, dst_error);
	EX_TABLE(54b, dst_error);

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	EX_TABLE(30b, src_error);
	EX_TABLE(31b, dst_error);
	EX_TABLE(40b, src_error);
	EX_TABLE(41b, dst_error);
	EX_TABLE(50b, src_error);
	EX_TABLE(51b, dst_error);

EXPORT_SYMBOL(csum_partial_copy_generic)
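
/*
 * How the fixup wiring above fits together, worked through for n = 0
 * (assuming the usual EX_TABLE(insn, handler) semantics: a fault at
 * `insn` redirects execution to `handler`):
 *
 * CSUM_COPY_16_BYTES_WITHEX(0) emits labels 800: through 807:, where
 * 800-803 tag the four loads and 804-807 the four stores of one
 * 16-byte group.  CSUM_COPY_16_BYTES_EXCODE(0) then pairs each label
 * with a handler, e.g.
 *
 *	EX_TABLE(800b, src_error)	// faulting lwz:  *src_err = -EFAULT
 *	EX_TABLE(804b, dst_error)	// faulting stw:  *dst_err = -EFAULT
 *
 * so a fault in the read half of the cacheline loop reports through
 * src_error and a fault in the write half through dst_error, via the
 * err pointers saved on the stack at function entry.
 */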

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			    const struct in6_addr *daddr,
 *			    __u32 len, __u8 proto, __wsum sum)
 */

_GLOBAL(csum_ipv6_magic)
	lwz	r8, 0(r3)
	lwz	r9, 4(r3)
	addc	r0, r7, r8
	lwz	r10, 8(r3)
	adde	r0, r0, r9
	lwz	r11, 12(r3)
	adde	r0, r0, r10
	lwz	r8, 0(r4)
	adde	r0, r0, r11
	lwz	r9, 4(r4)
	adde	r0, r0, r8
	lwz	r10, 8(r4)
	adde	r0, r0, r9
	lwz	r11, 12(r4)
	adde	r0, r0, r10
	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotlwi	r3, r0, 16
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
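
/*
 * The last four instructions above fold the 32-bit sum in r0 down to
 * the 16-bit result.  A rough C equivalent (illustrative sketch only):
 *
 *	u32 t = rol32(sum, 16);	// swap the 16-bit halves
 *	sum += t;		// upper half becomes hi + lo (+ carry)
 *	return (~sum) >> 16;	// complement, keep the upper halfword
 *
 * After the add, the upper halfword holds the end-around-carry sum of
 * the two halves, so complementing and extracting that halfword with
 * the rlwinm yields the final __sum16 value.
 */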