/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
ENTRY(csum_partial)
	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw
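
/*
 * For reference, ONES_ADD above behaves roughly like the C below: a 32-bit
 * add whose carry-out is wrapped back into bit 0 (end-around carry), which
 * keeps the running sum valid ones-complement arithmetic.  This is an
 * illustrative sketch only; the helper name ones_add is made up for this
 * comment and is not defined anywhere in this file.
 *
 *	static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;
 *		if (sum < val)		// the add carried out of bit 31 ...
 *			sum += 1;	// ... so wrap the carry back in
 *		return sum;
 *	}
 */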

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0..7  */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */



/*
 * Copy from src while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can call a custom exception handler for each access type.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f;		\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f;		\
	.previous

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

	This function is optimized for 4-byte aligned addresses.  Other
	alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */
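
/*
 * The alignment dispatch just below is roughly the following C.  This is an
 * illustrative sketch only; word_loop, halfword_loop and byte_loop are
 * made-up names for the code at labels 1:, 3: and 5: respectively.
 *
 *	switch ((src | dst) & 3) {
 *	case 0:				// both buffers 4-byte aligned
 *		goto word_loop;		// label 1:
 *	case 2:				// 2-byte aligned at best
 *		goto halfword_loop;	// label 3:
 *	default:			// at least one odd address
 *		goto byte_loop;		// label 5:
 *	}
 */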

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */
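
/*
 * Each iteration of the loop below does roughly the following C (sketch
 * only; ones_add is the made-up helper from the earlier comment).  The two
 * bytes are combined so the running sum matches what an aligned 16-bit load
 * would have contributed:
 *
 *	unsigned char b0 = src[0], b1 = src[1];
 *	dst[0] = b0;
 *	dst[1] = b1;
 * #ifdef __XTENSA_EB__
 *	sum = ones_add(sum, (b0 << 8) | b1);	// big endian
 * #else
 *	sum = ones_add(sum, (b1 << 8) | b0);	// little endian
 * #endif
 *	src += 2;
 *	dst += 2;
 */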

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous
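
/*
 * Caller-side view of the error reporting in the fixup handlers above, as an
 * illustrative sketch only.  The variable names and the -EFAULT return policy
 * shown here are assumptions of this example, not code from this file:
 *
 *	int src_err = 0, dst_err = 0;
 *	unsigned int sum;
 *
 *	sum = csum_partial_copy_generic(src, dst, len, 0, &src_err, &dst_err);
 *	if (src_err || dst_err)
 *		return -EFAULT;	// the fixup code stored -EFAULT through the
 *				// err pointer matching the faulting access;
 *				// on a source fault dst was zero-filled
 */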