/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
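/*
 * For reference only, a minimal C sketch of the sum computed above,
 * assuming big-endian data and glossing over the alignment handling.
 * The names csum_partial_ref, load_be32 and load_be16 are illustrative
 * helpers, not kernel API; the intermediate carry handling differs
 * from the adde/addze chain above, but the two agree once the caller
 * folds the result down to 16 bits.
 *
 *	u32 csum_partial_ref(const u8 *buff, int len, u32 sum)
 *	{
 *		u64 acc = sum;
 *
 *		for (; len >= 4; len -= 4, buff += 4)
 *			acc += load_be32(buff);
 *		if (len >= 2) {
 *			acc += load_be16(buff);
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			acc += (u32)*buff << 8;
 *		acc = (acc & 0xffffffff) + (acc >> 32);
 *		acc = (acc & 0xffffffff) + (acc >> 32);
 *		return (u32)acc;
 *	}
 */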
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
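/*
 * Illustration of how the two macros above pair up, written out here
 * only as a comment: for n = 0, CSUM_COPY_16_BYTES_WITHEX emits labels
 * 800: through 807: on its loads and stores, and the matching
 * CSUM_COPY_16_BYTES_EXCODE invocation emits the fixup entries
 *
 *	.long	800b,src_error
 *	.long	801b,src_error
 *	.long	802b,src_error
 *	.long	803b,src_error
 *	.long	804b,dst_error
 *	.long	805b,dst_error
 *	.long	806b,dst_error
 *	.long	807b,dst_error
 *
 * so a fault on any of the four loads resumes at src_error, and a
 * fault on any of the four stores resumes at dst_error.
 */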
_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8
	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0	/* is the destination address even? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * This handles faults in the cacheline loop above, branching to either
 * src_error (if the fault was in the read part) or dst_error (if it
 * was in the write part).
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error
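/*
 * For reference only: both routines return a 32-bit partial sum (in
 * r3) with the final carry already added in.  A sketch of the usual
 * fold from that 32-bit sum down to the final 16-bit checksum, as
 * performed by callers (compare the generic csum_fold()); fold_to_16
 * is a hypothetical name, illustrative rather than part of this file:
 *
 *	u16 fold_to_16(u32 sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum += sum >> 16;
 *		return (u16)~sum;
 *	}
 */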