/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

	.text

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(buff, len, sum)
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4		/* back up one word: loads below use displacement 4 */
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2		/* trailing halfword? */
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1		/* trailing byte? */
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
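
/*
 * Usage sketch (illustration only, not part of the original file):
 * C callers reach __csum_partial through the declarations in
 * arch/powerpc/include/asm/checksum.h and fold the running 32-bit
 * sum down to the final 16-bit checksum with csum_fold().  Assuming
 * those usual prototypes, a caller looks roughly like:
 *
 *	__wsum sum = __csum_partial(buff, len, 0);
 *	__sum16 csum = csum_fold(sum);	// add halves, complement
 *
 * Per the PPC32 ABI, the assembly above receives buff in r3, len in
 * r4 and sum in r5, and returns the result in r3.
 */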
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively (see src_error/dst_error
 * below) and returns.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
.section __ex_table,"a";		\
	.align	2;			\
	.long	8 ## n ## 0b,src_error;	\
	.long	8 ## n ## 1b,src_error;	\
	.long	8 ## n ## 2b,src_error;	\
	.long	8 ## n ## 3b,src_error;	\
	.long	8 ## n ## 4b,dst_error;	\
	.long	8 ## n ## 5b,dst_error;	\
	.long	8 ## n ## 6b,dst_error;	\
	.long	8 ## n ## 7b,dst_error;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)	/* save src_err pointer for src_error below */
	stw	r8,8(r1)	/* save dst_err pointer for dst_error below */

	rlwinm	r0,r4,3,0x8	/* r0 = 8 if dst is odd, else 0 */
	rlwnm	r6,r6,r0,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r0,0	/* is destination address even? */
	addic	r12,r6,0	/* r12 = sum; addic also clears carry */
	addi	r6,r4,-4	/* dst-4, for pre-incremented stores */
	neg	r0,r4
	addi	r4,r3,-4	/* src-4, for pre-incremented loads */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)	/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4		/* dcbz offset: r6 points 4 bytes below the next dst word */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6		/* zero the dst line instead of fetching it from memory */
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2		/* # leftover words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2		/* trailing halfword? */
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1		/* trailing byte? */
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12		/* add in final carry */
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)	/* src_err pointer saved in the prologue */
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr			/* caller passed NULL: nothing to report */
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)	/* dst_err pointer saved in the prologue */
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr			/* caller passed NULL: nothing to report */
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * This stuff handles faults in the cacheline loop and branches to either
 * src_error (if in the read part) or dst_error (if in the write part).
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error
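
/*
 * Usage sketch (illustration only, not part of the original file):
 * the error-pointer convention above is what the C wrappers in
 * arch/powerpc/include/asm/checksum.h rely on.  Assuming those usual
 * declarations, a checksumming copy from user space looks roughly like:
 *
 *	int err = 0;
 *	sum = csum_partial_copy_generic((__force const void *)src,
 *					dst, len, sum, &err, NULL);
 *	if (err)	// the routine stored -EFAULT through &err
 *		goto fault;
 *
 * On entry the PPC32 ABI puts src in r3, dst in r4, len in r5, sum in
 * r6, src_err in r7 and dst_err in r8, which is why r7/r8 are saved on
 * the stack in the prologue for the fault handlers to use.
 */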