/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- or 4-byte aligned buffers; odd
 * alignment is handled, but only by a slow byte-assembly path.
 */

/*
 * ONES_ADD converts a two's-complement addition into a ones'-complement
 * addition by folding the carry back into the sum (end-around carry);
 * in C terms: sum += val; if (sum < val) sum++;
 */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if not 4-byte aligned */
	/* Fall through on the common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if odd address */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
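	/*
	 * Roughly, one iteration of the loop below does this (illustrative
	 * C sketch, little-endian case; the big-endian build shifts the
	 * first byte to the top instead).  The 8/16/8-bit loads rebuild the
	 * same value an aligned l32i would have fetched:
	 *
	 *	u32 w = buf[0]				// odd address, byte load
	 *	      | ((u32)load16(buf + 1) << 8)	// buf + 1 is even, so the 16-bit load is aligned
	 *	      | ((u32)buf[3] << 24);
	 *	ONES_ADD(sum, w);
	 *	buf += 4;
	 *
	 * (load16() stands for the l16ui above; it is not a real helper.)
	 */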
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

	This function is optimized for 4-byte aligned addresses.  Other
	alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l/j combinations, which is actually
	   slower. */

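	/*
	 * In C terms, the dispatch below is roughly the following
	 * (word_copy, halfword_copy and byte_copy are illustrative names
	 * for the numeric labels 1, 3 and 5):
	 *
	 *	if (((src | dst) & 3) == 0)
	 *		goto word_copy;		// both 4-byte aligned
	 *	else if ((src | dst) & 1)
	 *		goto byte_copy;		// at least one address is odd
	 *	else
	 *		goto halfword_copy;	// 2-byte aligned
	 */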
	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(11f)	s32i	a9, a3, 0
EX(11f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(11f)	s32i	a9, a3, 8
EX(11f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(11f)	s32i	a9, a3, 16
EX(11f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(11f)	s32i	a9, a3, 24
EX(11f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(11f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(11f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(11f)	s8i	a9, a3, 0
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */

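	/*
	 * Roughly, one iteration of the pair loop below does this
	 * (illustrative C sketch, little-endian case; the big-endian
	 * build shifts the first byte up instead):
	 *
	 *	u8 lo = src[0], hi = src[1];	// faults go to fixup 10
	 *	dst[0] = lo;			// faults go to fixup 11
	 *	dst[1] = hi;
	 *	ONES_ADD(sum, lo | ((u32)hi << 8));
	 *	src += 2;
	 *	dst += 2;
	 */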
	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(11f)	s8i	a9, a3, 0
EX(11f)	s8i	a8, a3, 1
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

10:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

11:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous
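/*
 * For reference, the fixup paths above behave roughly like this C sketch
 * (illustrative only; control reaches them when one of the EX()-annotated
 * loads or stores faults):
 *
 *	source fault (label 10):
 *		*src_err_ptr = -EFAULT;
 *		memset(original_dst, 0, original_len);	// a12, a11
 *		return 0;
 *
 *	destination fault (label 11):
 *		*dst_err_ptr = -EFAULT;
 *		return 0;
 */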