/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
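
/*
 * For reference only: a minimal C sketch of what __csum_partial
 * computes, assuming big-endian, unaligned-tolerant loads as on
 * 32-bit PowerPC.  The function name and the 64-bit accumulator are
 * illustrative; the asm above folds carries on the fly with
 * adde/addze instead, but both agree once the result is folded to
 * 16 bits (e.g. by csum_fold).
 *
 * #include <stdint.h>
 *
 * static uint32_t csum_partial_ref(const uint8_t *buff, int len,
 *				    uint32_t sum)
 * {
 *	uint64_t acc = sum;
 *
 *	for (; len >= 4; len -= 4, buff += 4)	// whole words
 *		acc += *(const uint32_t *)buff;
 *	if (len >= 2) {				// trailing halfword
 *		acc += *(const uint16_t *)buff;
 *		buff += 2;
 *		len -= 2;
 *	}
 *	if (len)				// odd trailing byte goes in
 *		acc += (uint32_t)*buff << 8;	// the upper byte, as above
 *	while (acc >> 32)			// end-around carry folding
 *		acc = (acc & 0xffffffffU) + (acc >> 32);
 *	return (uint32_t)acc;
 * }
 */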

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
.section __ex_table,"a";		\
	.align	2;			\
	.long	8 ## n ## 0b,src_error;	\
	.long	8 ## n ## 1b,src_error;	\
	.long	8 ## n ## 2b,src_error;	\
	.long	8 ## n ## 3b,src_error;	\
	.long	8 ## n ## 4b,dst_error;	\
	.long	8 ## n ## 5b,dst_error;	\
	.long	8 ## n ## 6b,dst_error;	\
	.long	8 ## n ## 7b,dst_error;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

	andi.	r0,r4,1			/* is destination address even ? */
	cmplwi	cr7,r0,0
	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate sum left one byte */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * This code handles faults in the cacheline loop and branches to
 * either src_error (if in the read part) or dst_error (if in the
 * write part).
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error
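
/*
 * For reference only: the success-path semantics of
 * csum_partial_copy_generic at the C level, leaving out the fault
 * handling that the __ex_table entries above supply.  The helper
 * names are illustrative, not kernel API; csum_partial_ref is the
 * sketch given after __csum_partial.
 *
 * #include <stdint.h>
 * #include <string.h>
 *
 * static uint32_t csum_copy_ref(const uint8_t *src, uint8_t *dst,
 *				 int len, uint32_t sum)
 * {
 *	memcpy(dst, src, len);		// the asm fuses copy and sum
 *					// into a single pass over src
 *	return csum_partial_ref(dst, len, sum);
 * }
 *
 * On a fault the real routine stores -EFAULT through *src_err or
 * *dst_err (when non-NULL) and returns.  The dcbz at label 54 zeroes
 * each destination cacheline before it is filled, so full lines are
 * never fetched from memory only to be overwritten.
 */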